Skip to content
Snippets Groups Projects
Commit b3944989 authored by thierry's avatar thierry
Browse files

begininng of parseReportFile function to get information of validation in diann log file

parent a4126644
No related branches found
No related tags found
No related merge requests found
...@@ -2,95 +2,143 @@ import re ...@@ -2,95 +2,143 @@ import re
from Bio import SeqIO from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis from Bio.SeqUtils.ProtParam import ProteinAnalysis
prot_descr = {} import argparse
import sys
import os
with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle: def parseReportLog(reportlogFile):
for record in SeqIO.parse(handle, "fasta"): if os.path.exists(reportlogFile):
print(record.id) inputLog = open(reportlogFile, "r")
record_id = re.sub("^[a-zA-Z]+\|", "", record.id) fastafile = []
print(record_id) inputfile =[]
record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id) for line in inputLog:
print(record_id) # print(line)
prot_descr[record_id] = record.id + " " + record.description if "--fasta" in line:
with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle: print(line)
for record in SeqIO.parse(handle, "fasta"): if match := re.search("(--f\ .*mzML|mzXML)", line):
prot_descr[record.id] = record.id + " " + record.description matchpart = match.group(1).split("--f")
inputfile = [x.strip() for x in matchpart if x != ""]
print(inputfile)
uniprot = {}
uniprotfile = open("uniprot.csv", "r")
for line in uniprotfile:
if "record_id" not in line:
line = line.replace('"', "")
tabline = line.strip().split("\t")
id = f"(UniMod:{tabline[0]})"
uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
uniprotfile.close()
pep_ids = open("pep_ids.tsv", "w")
pep_ids.write("Modified.sequence\tSequence\tmods\n")
file = open("report.tsv", "r")
samples = {}
for line in file:
if "File.Name" not in line:
tabline = line.split("\t")
sample = tabline[1]
previous_match = 0
mods = []
mods_string = []
for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
#print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
amino_acid = tabline[13][match.span()[0]-1]
amino_acid_number = match.span()[0]-previous_match
#print(match.group())
#print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
if match.group() == "(UniMod:4)":
mono_isotopique=57.0214614868
elif match.group() == "(UniMod:35)":
mono_isotopique=15.9949102402
elif match.group() == "(UniMod:1)":
mono_isotopique=42.0105667114
mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
previous_match += match.span()[1]-match.span()[0]
mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
if sample in samples.keys():
samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
else:
samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
pep_ids.close()
outputfile = open("outputfile.xml", "w") # #Defined command line
outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n') # desc = "Process Diann result to peptide result. \
outputfile.write('<peptide_result>\n') # The result can then be grouped by gp-grouping program."
outputfile.write('<filter evalue="0.01" />\n') # command = argparse.ArgumentParser(prog='gp-read-diann', \
for sample in samples: # description=desc, usage='%(prog)s [options] files')
outputfile.write(f'<sample name="{sample}" file="{sample}">\n') # command.add_argument('-q', '--Qvalue', default=0.01, type=float, \
scanid = 0 # help='Minimum peptide qvalue threshold (default:0.01)')
modlines = "" # command.add.argument('-p', '--PG.Qvalue', default=0.01, type=float, \
for peptiz in samples[sample]: # help='Minimum protein group qvalue threshold (default:0.01)')
scanid = scanid+1 # command.add_argument('-o', '--outfile', nargs="?", \
# print(f"scan {scanid} : {peptiz}") # type=argparse.FileType("w"), default=sys.stdout, \
current_pep = peptiz # help='Save peptide result default:STDOUT')
prot = current_pep["proteins.Ids"].split(";") # command.add_argument('-f', '--fasta', default="", type=str, \
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight() # help='The Fasta file which is used for DIA-NN processing for multiple')
if len(current_pep['mods']) != 0: # command.add_argument('-c', '--contaminant_fasta', default="", type=str, \
for mod in current_pep['mods']: # help='The Contaminant Fasta file which is used for DIA-NN processing')
pep_mass += float(mod["mono_isotopique"]) # command.add_argument('files', metavar='files', nargs='+', \
outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n") # help='List of X!Tandem files to process')
for i in range(len(prot)): # command.add_argument('-v', '--version', action='version', \
if len(current_pep['mods']) == 0: # version='%(prog)s ${GP_VERSION}')
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight() #
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n") #
else: #
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n") # prot_descr = {}
for mod in current_pep['mods']: #
outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n') # with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
outputfile.write("</psm>\n") # for record in SeqIO.parse(handle, "fasta"):
outputfile.write("</scan>\n") # print(record.id)
outputfile.write("</sample>\n") # record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
outputfile.write("</peptide_result>\n") # print(record_id)
outputfile.close() # record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
# print(record_id)
# prot_descr[record_id] = record.id + " " + record.description
# with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
# for record in SeqIO.parse(handle, "fasta"):
# prot_descr[record.id] = record.id + " " + record.description
#
#
#
# uniprot = {}
# uniprotfile = open("uniprot.csv", "r")
# for line in uniprotfile:
# if "record_id" not in line:
# line = line.replace('"', "")
# tabline = line.strip().split("\t")
# id = f"(UniMod:{tabline[0]})"
# uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
# uniprotfile.close()
#
# pep_ids = open("pep_ids.tsv", "w")
# pep_ids.write("Modified.sequence\tSequence\tmods\n")
#
# file = open("report.tsv", "r")
# samples = {}
# for line in file:
# if "File.Name" not in line:
# tabline = line.split("\t")
# sample = tabline[1]
# previous_match = 0
# mods = []
# mods_string = []
# for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
# #print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
# amino_acid = tabline[13][match.span()[0]-1]
# amino_acid_number = match.span()[0]-previous_match
# #print(match.group())
# #print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
# if match.group() == "(UniMod:4)":
# mono_isotopique=57.0214614868
# elif match.group() == "(UniMod:35)":
# mono_isotopique=15.9949102402
# elif match.group() == "(UniMod:1)":
# mono_isotopique=42.0105667114
# mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
# previous_match += match.span()[1]-match.span()[0]
# mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
# pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
# if sample in samples.keys():
# samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
# else:
# samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
#
# pep_ids.close()
#
# outputfile = open("outputfile.xml", "w")
# outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
# outputfile.write('<peptide_result>\n')
# outputfile.write('<filter evalue="0.01" />\n')
# for sample in samples:
# outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
# scanid = 0
# modlines = ""
# for peptiz in samples[sample]:
# scanid = scanid+1
# # print(f"scan {scanid} : {peptiz}")
# current_pep = peptiz
# prot = current_pep["proteins.Ids"].split(";")
# pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
# if len(current_pep['mods']) != 0:
# for mod in current_pep['mods']:
# pep_mass += float(mod["mono_isotopique"])
# outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
# for i in range(len(prot)):
# if len(current_pep['mods']) == 0:
# pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
# outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
# else:
# outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
# for mod in current_pep['mods']:
# outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
# outputfile.write("</psm>\n")
# outputfile.write("</scan>\n")
# outputfile.write("</sample>\n")
# outputfile.write("</peptide_result>\n")
# outputfile.close()
parseReportLog("/gorgone/pappso/moulon/users/thierry/20240329_test_astral/diann/col-O_DIA_180SPD/report.log.txt")
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment