From 7612fd4a8828c9e670a1c6e8ed8684de5e4952d6 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 25 Oct 2024 13:49:06 +0200 Subject: [PATCH 1/5] Added Pan1c prefix instead of pan1c --- Snakefile | 200 ++++++++++++++++++++++++++---------------------------- 1 file changed, 97 insertions(+), 103 deletions(-) diff --git a/Snakefile b/Snakefile index fbbc6ef..12975d7 100644 --- a/Snakefile +++ b/Snakefile @@ -36,17 +36,17 @@ nHAP = len(SAMPLES) with gzip.open("data/haplotypes/"+config['reference'], "r") as handle: CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"] -graph_tools = ["pan1c"] + (config["get_MC"] == "True")*["MC"] +graph_tools = ["PGGB"] + (config["get_MC"] == "True")*["MC"] # Adding optionnal output based on config.yaml, using the following function def which_analysis(): ## Default analysis analysis_inputs = [ - expand("output/stats/{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats - expand("output/panacus.reports/{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth - expand("output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs - expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics + expand("output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv", gtool=graph_tools), # core stats + expand("output/panacus.reports/Pan1c.{gtool}."+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST, gtool=graph_tools), # panacus histgrowth + expand("output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST, gtool=graph_tools), # visualizations from odgi on chromosome graphs + expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools) # chromosomes graph statistics ] ## Optionals analysis steps @@ -55,41 +55,35 @@ def which_analysis(): if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly analysis_inputs.append( - expand("output/asm.syri.figs/"+config['name']+".{haplotype}.syri.{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"]) + expand("output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_{tool}.png", haplotype=SAMPLES_NOREF, tool=["mm2"]) ) if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input analysis_inputs.append( - expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", chromosome=CHRLIST) + expand("output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", chromosome=CHRLIST) ) if config["run_Quast"] == "True": # Running Quast on input haplotypes analysis_inputs.append( - "output/"+config['name']+".quast.report.html" + "output/Pan1c."+config['name']+".quast.report.html" ) if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure analysis_inputs.append( - expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) + expand("output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", chromosome=CHRLIST) ) if config["create_report"] == "True": # Creating report (need contig) analysis_inputs.append( - expand("output/{gtool}."+config['name']+".report.md", gtool=graph_tools) + expand("output/Pan1c.{gtool}."+config['name']+".report.md", gtool=graph_tools) ) - analysis_inputs.append("output/report_data/"+config['name']+".assembly.json") - analysis_inputs.append("output/report_data/"+config['name']+".graph.json") + analysis_inputs.append("output/report_data/Pan1c."+config['name']+".assembly.json") + analysis_inputs.append("output/report_data/Pan1c."+config['name']+".graph.json") if config["get_VCF"] == "True": # VCF from the final graph against the "reference" analysis_inputs.append( expand("output/{gtool}.vcf.figs", gtool=graph_tools) ) - analysis_inputs.append("output/report_data/"+config['name']+".var.json") + analysis_inputs.append("output/report_data/Pan1c."+config['name']+".var.json") return analysis_inputs -""" -Functions --------------------------------------------------------------------------------------- -""" -def get_mem_mb(wildcards, attempt, threads, multiplier=config["mem_multiplier"]): - return attempt * multiplier * threads - """ Rules ------------------------------------------------------------------------------------------- """ @@ -97,8 +91,8 @@ Rules ------------------------------------------------------------------------ # Main target rule rule all: input: - expand("output/{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output) - "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) + expand("output/Pan1c.{gtool}."+config['name']+".gfa.gz", gtool=graph_tools), # Final graph (main output) + "output/Pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line) which_analysis() """ @@ -153,7 +147,7 @@ rule quast_stats: fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), ref="data/haplotypes/"+config['reference'] output: - report="output/"+config['name']+".quast.report.html" + report="output/Pan1c."+config['name']+".quast.report.html" threads: 16 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 @@ -213,13 +207,13 @@ rule assemblathon_stats: mv data/haplotypes/{wildcards.haplotype}.csv {output.csv} """ -rule contig_position: +rule contig_positions: # Produce figures with contig positions input: fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz", fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai" output: - fig="output/chr.contig/{chromosome}.contig.png", + fig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", outdir=temp(directory("output/chr.contig/{chromosome}")) threads: 1 resources: @@ -290,8 +284,8 @@ rule SyRI_on_ASM_mm2: ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" output: - fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", - vcf="data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz" + fig="output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", + vcf="data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz" log: cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.cmd.log", time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.mm2.time.log" @@ -335,8 +329,8 @@ rule SyRI_on_ASM_wfm: ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" output: - fig="output/asm.syri.figs/"+config['name']+".{haplotype}.syri.wfm.png", - vcf="data/asm.syri.wfm/"+config['name']+".{haplotype}.syri.wfm.vcf.gz" + fig="output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_wfm.png", + vcf="data/asm.syri.wfm/Pan1c."+config['name']+".{haplotype}.syri.wfm.vcf.gz" log: cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.cmd.log", time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.wfm.time.log" @@ -384,19 +378,19 @@ def asm_json_inputs(wildcards): if config["get_contig_pos"] == "True": sections["contig_pos"] = expand( - "output/chr.contig/{chromosome}.contig.png", + "output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", chromosome=CHRLIST ) if config["get_ASMs_SyRI"] == "True": sections["SyRI_on_ASMs_figs"] = expand( - "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", + "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", haplotype=SAMPLES_NOREF ) if config["get_chrInputs_SyRI"] == "True": sections["SyRI_on_chrInputs_figs"] = expand( - "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", + "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", chromosome=CHRLIST ) @@ -407,8 +401,8 @@ rule asm_json: input: unpack(asm_json_inputs) output: - json="output/report_data/"+config['name']+".assembly.json", - merged="output/report_data/"+config['name']+".assemblathon_stats.tsv" + json="output/report_data/Pan1c."+config['name']+".assembly.json", + merged="output/report_data/Pan1c."+config['name']+".assemblathon_stats.tsv" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000 @@ -444,7 +438,7 @@ rule SyRI_on_chrInput: input: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: - fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png" + fig="output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png" threads: 8 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 12000 @@ -497,10 +491,10 @@ rule wfmash_on_chr: fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai' output: - mapping=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf"), - aln=temp("data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf"), - mapping_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.mapping.paf.gz", - aln_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.wfmash.aln.paf.gz" + mapping=temp("data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.mapping.paf"), + aln=temp("data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.aln.paf"), + mapping_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.mapping.paf.gz", + aln_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.wfmash.aln.paf.gz" threads: 16 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 @@ -553,7 +547,7 @@ rule seqwish: fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', aln=rules.wfmash_on_chr.output.aln_gz output: - gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfa.gz" + gfa_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfa.gz" threads: 8 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 @@ -585,8 +579,8 @@ rule gfaffix_on_chr: input: rules.seqwish.output.gfa_gz output: - gfa_gz="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.gfa.gz", - transform="data/chrGraphs/pan1c.{chromosome}/pan1c.{chromosome}.seqwish.gfaffixD.transform.txt" + gfa_gz="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfaffixD.gfa.gz", + transform="data/chrGraphs/PGGB.{chromosome}/Pan1c."+config['name']+".{chromosome}.seqwish.gfaffixD.transform.txt" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 24000 @@ -617,10 +611,10 @@ rule gfaffix_on_chr: rule odgi_postprocessing: # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph input: - tags="output/pan1c."+config['name']+".gfa.metadata", + tags="output/Pan1c."+config['name']+".gfa.metadata", gfa_gz=rules.gfaffix_on_chr.output.gfa_gz output: - gfa_gz='data/chrGraphs/pan1c.'+config['name']+'.{chromosome}.gfa.gz' + gfa_gz="data/chrGraphs/Pan1c.PGGB."+config['name']+".{chromosome}.gfa.gz" threads: 8 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 @@ -686,10 +680,10 @@ rule odgi_postprocessing: rule MC_graph: input: - tags="output/pan1c."+config['name']+".gfa.metadata", + tags="output/Pan1c."+config['name']+".gfa.metadata", fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: - gfa_gz='data/chrGraphs/MC.'+config['name']+'.{chromosome}.gfa.gz' + gfa_gz='data/chrGraphs/Pan1c.MC.'+config['name']+'.{chromosome}.gfa.gz' threads: 16 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000 @@ -740,9 +734,9 @@ rule MC_graph: rule generate_graph_list: # Generate a text file containing all created graphs input: - gfas=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST) + gfas=expand('data/chrGraphs/Pan1c.{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST) output: - "data/chrGraphs/graphsList.{gtool}.txt" + temp("data/chrGraphs/graphsList.{gtool}.txt") threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 @@ -756,13 +750,13 @@ rule graph_squeeze: # Using odgi to merge every subgraphs into a final one input: glist="data/chrGraphs/graphsList.{gtool}.txt", - tags="output/pan1c."+config['name']+".gfa.metadata", - graphs=expand('data/chrGraphs/{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST) + tags="output/Pan1c."+config['name']+".gfa.metadata", + graphs=expand('data/chrGraphs/Pan1c.{{gtool}}.'+config['name']+'.{chromosome}.tmp.gfa', chromosome=CHRLIST) output: - gfa_gz="output/{gtool}."+config['name']+".gfa.gz" + gfa_gz="output/Pan1c.{gtool}."+config['name']+".gfa.gz" log: - cmd="logs/squeeze/{gtool}."+config['name']+".squeeze.cmd.log", - time="logs/squeeze/{gtool}."+config['name']+".squeeze.time.log", + cmd="logs/squeeze/Pan1c.{gtool}."+config['name']+".squeeze.cmd.log", + time="logs/squeeze/Pan1c.{gtool}."+config['name']+".squeeze.time.log", threads: 16 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 @@ -794,10 +788,10 @@ rule graph_squeeze: rule graph_stats: # Using GFAstats to produce stats on every chromosome graphs input: - graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.gfa.gz' + graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.gfa.gz' output: - genstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.general.stats.tsv", - pathstats="output/stats/chrGraphs.{gtool}/{gtool}."+config['name']+".{chromosome}.path.stats.tsv" + genstats="output/stats/chrGraphs.{gtool}/Pan1c.{gtool}."+config['name']+".{chromosome}.general.stats.tsv", + pathstats="output/stats/chrGraphs.{gtool}/Pan1c.{gtool}."+config['name']+".{chromosome}.path.stats.tsv" threads: 4 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000 @@ -815,10 +809,10 @@ rule graph_stats: rule graph_figs: # Creating figures using odgi viz input: - graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa' + graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa' output: - oneDviz="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.1Dviz.png", - pcov="output/chrGraphs.figs/{gtool}."+config['name']+".{chromosome}.pcov.png" + oneDviz="output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.1Dviz.png", + pcov="output/chrGraphs.figs/Pan1c.{gtool}."+config['name']+".{chromosome}.pcov.png" threads: 4 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 4000 @@ -841,10 +835,10 @@ rule graph_figs: rule aggregate_graphs_stats: # Reading and merging all stats files from chromosome graphs into a .tsv. input: - genstats=expand("output/stats/chrGraphs.{{gtool}}/{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) + genstats=expand("output/stats/chrGraphs.{{gtool}}/Pan1c.{{gtool}}."+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST) output: - genstats="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", - pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv" + genstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", + pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 @@ -865,8 +859,8 @@ rule get_graph_tags: input: "config.yaml" output: - md="output/pan1c."+config['name']+".gfa.metadata", - json="output/report_data/"+config['name']+".tags.json" + md="output/Pan1c."+config['name']+".gfa.metadata", + json="output/report_data/Pan1c."+config['name']+".tags.json" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 8000 @@ -883,9 +877,9 @@ rule get_graph_tags: rule pggb_input_stats: # Produces statistics on pggb input sequences input: - flag="output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv" + flag="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv" output: - "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv" + "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000 @@ -901,10 +895,10 @@ rule pggb_input_stats: rule core_statistics: # Aggregate chrInput, chrGraph and pggb statistics into a single tsv input: - chrInputStats = "output/stats/{gtool}."+config['name']+".chrInput.stats.tsv", - chrGraphStats = "output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv" + chrInputStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrInput.stats.tsv", + chrGraphStats = "output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv" output: - tsv = "output/stats/{gtool}."+config['name']+".core.stats.tsv" + tsv = "output/stats/Pan1c.{gtool}."+config['name']+".core.stats.tsv" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 @@ -921,11 +915,11 @@ rule core_statistics: rule graph_json: # Produce the Graph JSON for Pan1c QC input: - genstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools), - pathstats = expand("output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools), - odgifigs = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST) + genstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.general.stats.tsv", gtool=graph_tools), + pathstats = expand("output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv", gtool=graph_tools), + odgifigs = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", gtool=graph_tools, chromosome=CHRLIST) output: - json="output/report_data/"+config['name']+".graph.json" + json="output/report_data/Pan1c."+config['name']+".graph.json" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000 @@ -968,12 +962,12 @@ rule get_pav: rule panacus_stats: # Produces panacus reports for a chromosome graph input: - graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa' + graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa' output: - html='output/panacus.reports/{gtool}.'+config['name']+'.{chromosome}.histgrowth.html' + html='output/panacus.reports/Pan1c.{gtool}.'+config['name']+'.{chromosome}.histgrowth.html' log: - cmd="logs/panacus/{gtool}.{chromosome}.panacus.cmd.log", - time="logs/panacus/{gtool}.{chromosome}.panacus.time.log" + cmd="logs/panacus/Pan1c.{gtool}.{chromosome}.panacus.cmd.log", + time="logs/panacus/Pan1c.{gtool}.{chromosome}.panacus.time.log" params: app_path=config['app.path'], pan_name=config['name'], @@ -997,9 +991,9 @@ rule panacus_stats: rule vg_deconstruct: # Produce a VCF based on the "reference" haplotype input: - graph="output/{gtool}."+config['name']+".xg", + graph="output/Pan1c.{gtool}."+config['name']+".xg", output: - vcf=temp("output/{gtool}."+config['name']+".vcf"), + vcf=temp("output/Pan1c.{gtool}."+config['name']+".vcf"), threads: 8 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 32000 @@ -1007,8 +1001,8 @@ rule vg_deconstruct: app_path=config['app.path'], ref=config['reference'] log: - cmd="logs/vg_deconstruct/{gtool}.vg_deconstruct.cmd.log", - time="logs/vg_deconstruct/{gtool}.vg_deconstruct.time.log" + cmd="logs/vg_deconstruct/Pan1c.{gtool}.vg_deconstruct.cmd.log", + time="logs/vg_deconstruct/Pan1c.{gtool}.vg_deconstruct.time.log" shell: """ /usr/bin/time -v -o {log.time} \ @@ -1024,8 +1018,8 @@ rule vg_deconstruct: rule vcf_fig: # Produce a figure describing INS/DEL length distribution from vg deconstruct and SyRI input: - vg="output/{gtool}."+config['name']+".vcf.gz", - syris_mm2=expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF) + vg="output/Pan1c.{gtool}."+config['name']+".vcf.gz", + syris_mm2=expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF) output: vcf_fig=directory("output/{gtool}.vcf.figs") threads: 1 @@ -1077,7 +1071,7 @@ rule vcf_fig: rule vg_vcf_2_tsv: input: - "output/{gtool}."+config['name']+".vcf.gz" + "output/Pan1c.{gtool}."+config['name']+".vcf.gz" output: temp("tmp/var_json/vg_{gtool}.tsv") threads: 1 @@ -1090,7 +1084,7 @@ rule vg_vcf_2_tsv: rule syri_vcf_2_tsv: input: - expand("data/asm.syri.mm2/"+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF) + expand("data/asm.syri.mm2/Pan1c."+config['name']+".{haplotype}.syri.mm2.vcf.gz", haplotype=SAMPLES_NOREF) output: temp("tmp/var_json/syri_mm2.tsv") threads: 1 @@ -1138,7 +1132,7 @@ rule var_json: input: unpack(var_json_inputs) output: - json="output/report_data/"+config['name']+".var.json" + json="output/report_data/Pan1c."+config['name']+".var.json" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 48000 @@ -1155,12 +1149,12 @@ rule var_json: rule create_pan1c_report_fig: # Produces a markdown report figure of chromosomes graphs input: - graph='data/chrGraphs/{gtool}.'+config['name']+'.{chromosome}.tmp.gfa', - contigfig="output/chr.contig/{chromosome}.contig.png", + graph='data/chrGraphs/Pan1c.{gtool}.'+config['name']+'.{chromosome}.tmp.gfa', + contigfig="output/chr.contig/Pan1c."+config['name']+"{chromosome}.contig.png", output: - odgifig=temp("tmp/{gtool}.{chromosome}.odgi.png"), - namefig=temp("tmp/{gtool}.{chromosome}.name.png"), - reportfig="output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png" + odgifig=temp("tmp/Pan1c.{gtool}.{chromosome}.odgi.png"), + namefig=temp("tmp/Pan1c.{gtool}.{chromosome}.name.png"), + reportfig="output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png" threads: 4 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 2000 @@ -1198,14 +1192,14 @@ rule create_pan1c_report_fig: rule create_chrGraphs_figs: # Produce figures based on aggregated path stats input: - pathstats="output/stats/{gtool}."+config['name']+".chrGraph.path.stats.tsv" + pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv" output: barplots=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST), scatters=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST), heatmaps=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST), - barplot_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.mean.png", - scatter_mean="output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.mean.png", - heatmap_diff="output/chrGraphs.stats.figs/{gtool}."+config['name']+".shared.content.diff.png" + barplot_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.mean.png", + scatter_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.mean.png", + heatmap_diff="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".shared.content.diff.png" threads: 1 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 16000 @@ -1232,26 +1226,26 @@ def get_report_sections(wildcards): """ sections = dict() - sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata" - sections["odgifigs"] = expand("output/report/{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) + sections["metadata"] = "output/Pan1c."+config['name']+".gfa.metadata" + sections["odgifigs"] = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) sections["genstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv" sections["pathstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv" - sections["barplots"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) - sections["scatters"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) - sections["heatmaps"] = expand("output/chrGraphs.stats.figs/{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) + sections["barplots"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) + sections["scatters"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) + sections["heatmaps"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) sections["barplot_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png" sections["scatter_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png" sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".shared.content.diff.png" if config["get_ASMs_SyRI"] == "True": sections["SyRI_on_ASMs_figs"] = expand( - "output/asm.syri.figs/"+config['name']+".{haplotype}.syri.mm2.png", + "output/asm.syri.figs/Pan1c."+config['name']+".{haplotype}.syri_mm2.png", haplotype=SAMPLES_NOREF ) if config["get_chrInputs_SyRI"] == "True": sections["SyRI_on_chrInputs_figs"] = expand( - "output/chrInput.syri.figs/"+config['name']+".{chromosome}.syri.png", + "output/chrInput.syri.figs/Pan1c."+config['name']+".{chromosome}.syri_mm2.png", chromosome=CHRLIST ) @@ -1265,8 +1259,8 @@ rule create_pan1c_report: input: unpack(get_report_sections) output: - report="output/{gtool}."+config['name']+".report.md", - html="output/{gtool}."+config['name']+".report.html" + report="output/Pan1c.{gtool}."+config['name']+".report.md", + html="output/Pan1c.{gtool}."+config['name']+".report.html" threads: 4 resources: mem_mb = lambda wildcards, threads: threads * config["mem_multiplier"] * 500 -- GitLab From 230d9103fe5d6f4ae90a844ae887a2ccd160afd2 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 25 Oct 2024 14:48:27 +0200 Subject: [PATCH 2/5] Testing --- rules/tools.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rules/tools.smk b/rules/tools.smk index db1f566..490e74a 100644 --- a/rules/tools.smk +++ b/rules/tools.smk @@ -27,9 +27,9 @@ rule samtools_index: rule run_bgzip: # Run BGZIP on the file input: - "{file}" + "{file}.fa" output: - "{file}.gz" + "{file}.fa.gz" threads: 4 retries: 1 resources: -- GitLab From b23eea95d1796bc4e91631ff873951519181c79e Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 25 Oct 2024 14:50:58 +0200 Subject: [PATCH 3/5] Fixed missing Pan1c --- Snakefile | 6 +++--- scripts/graph.pan1c_QC.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Snakefile b/Snakefile index 12975d7..db81dc7 100644 --- a/Snakefile +++ b/Snakefile @@ -1194,9 +1194,9 @@ rule create_chrGraphs_figs: input: pathstats="output/stats/Pan1c.{gtool}."+config['name']+".chrGraph.path.stats.tsv" output: - barplots=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST), - scatters=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST), - heatmaps=expand("output/chrGraphs.stats.figs/{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST), + barplots=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST), + scatters=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST), + heatmaps=expand("output/chrGraphs.stats.figs/Pan1c.{{gtool}}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST), barplot_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.mean.png", scatter_mean="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.mean.png", heatmap_diff="output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".shared.content.diff.png" diff --git a/scripts/graph.pan1c_QC.py b/scripts/graph.pan1c_QC.py index 3c1bb07..8c41d82 100644 --- a/scripts/graph.pan1c_QC.py +++ b/scripts/graph.pan1c_QC.py @@ -2,7 +2,7 @@ Graph JSON creator for Pan1c-QC @author: alexis.mergez@inrae.fr -@version: 1.0 +@version: 1.1 """ import os @@ -99,6 +99,8 @@ for tsv in args.path: for query in chrdf["Query.name"].unique(): shared_table[gtool][chrid][query] = chrdf[chrdf["Query.name"] == query].drop(columns="Query.name").set_index("Target.name").to_dict(orient="index") + + ## Assembling output JSON Graph_JSON = { -- GitLab From 2658cabfd90535217bc26488dcb60883ae7408ce Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 25 Oct 2024 14:57:47 +0200 Subject: [PATCH 4/5] Found other missing 'Pan1c.' --- Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Snakefile b/Snakefile index db81dc7..6684e67 100644 --- a/Snakefile +++ b/Snakefile @@ -1228,14 +1228,14 @@ def get_report_sections(wildcards): sections["metadata"] = "output/Pan1c."+config['name']+".gfa.metadata" sections["odgifigs"] = expand("output/report/Pan1c.{gtool}."+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) - sections["genstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv" - sections["pathstats"] = f"output/stats/{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv" + sections["genstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.general.stats.tsv" + sections["pathstats"] = f"output/stats/Pan1c.{wildcards.gtool}."+config['name']+".chrGraph.path.stats.tsv" sections["barplots"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".path.decomp.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) sections["scatters"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".2D.scatter.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) sections["heatmaps"] = expand("output/chrGraphs.stats.figs/Pan1c.{gtool}."+config['name']+".sharred.content.{chromosome}.png", chromosome=CHRLIST, gtool=[wildcards.gtool]) - sections["barplot_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".path.decomp.mean.png" - sections["scatter_mean"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".2D.scatter.mean.png" - sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/{wildcards.gtool}."+config['name']+".shared.content.diff.png" + sections["barplot_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".path.decomp.mean.png" + sections["scatter_mean"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".2D.scatter.mean.png" + sections["heatmap_diff"] = f"output/chrGraphs.stats.figs/Pan1c.{wildcards.gtool}."+config['name']+".shared.content.diff.png" if config["get_ASMs_SyRI"] == "True": sections["SyRI_on_ASMs_figs"] = expand( -- GitLab From a2bb964af3aecb66402509e10348f01a07e2e1d3 Mon Sep 17 00:00:00 2001 From: Alexis Mergez <alexis.mergez@inrae.fr> Date: Fri, 25 Oct 2024 14:58:30 +0200 Subject: [PATCH 5/5] Undid modif to run_bgzip --- rules/tools.smk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rules/tools.smk b/rules/tools.smk index 490e74a..db1f566 100644 --- a/rules/tools.smk +++ b/rules/tools.smk @@ -27,9 +27,9 @@ rule samtools_index: rule run_bgzip: # Run BGZIP on the file input: - "{file}.fa" + "{file}" output: - "{file}.fa.gz" + "{file}.gz" threads: 4 retries: 1 resources: -- GitLab