Skip to content
Snippets Groups Projects
Commit a4126644 authored by Langella Olivier's avatar Langella Olivier
Browse files

Merge branch 'main' of forgemia.inra.fr:pappso/groupingprotein

parents c33e8d4f 0566317f
No related branches found
No related tags found
No related merge requests found
import re
from Bio import SeqIO
from Bio.SeqUtils.ProtParam import ProteinAnalysis
prot_descr = {}
with open("/gorgone/pappso/moulon/database/uniprotkb_S_cerevisiae_S288C_20241125.fasta") as handle:
for record in SeqIO.parse(handle, "fasta"):
print(record.id)
record_id = re.sub("^[a-zA-Z]+\|", "", record.id)
print(record_id)
record_id = re.sub("\|[a-zA-Z0-9]*_[a-zA-Z0-9]*", "", record_id)
print(record_id)
prot_descr[record_id] = record.id + " " + record.description
with open("/gorgone/pappso/moulon/database/contaminants_standarts.fasta") as handle:
for record in SeqIO.parse(handle, "fasta"):
prot_descr[record.id] = record.id + " " + record.description
uniprot = {}
uniprotfile = open("uniprot.csv", "r")
for line in uniprotfile:
if "record_id" not in line:
line = line.replace('"', "")
tabline = line.strip().split("\t")
id = f"(UniMod:{tabline[0]})"
uniprot[id] = {"id":tabline[0], "title":tabline[1], "full_name":tabline[2], "mono_isotopique":tabline[3]}
uniprotfile.close()
pep_ids = open("pep_ids.tsv", "w")
pep_ids.write("Modified.sequence\tSequence\tmods\n")
file = open("report.tsv", "r")
samples = {}
for line in file:
if "File.Name" not in line:
tabline = line.split("\t")
sample = tabline[1]
previous_match = 0
mods = []
mods_string = []
for match in re.finditer(r"(\(UniMod\:[0-9]*\))", tabline[13]):
#print(f"{match.group()} : {match.span()[0]} {tabline[13]} {tabline[13][match.span()[0]-1]}, {match.span()[1]-match.span()[0]}")
amino_acid = tabline[13][match.span()[0]-1]
amino_acid_number = match.span()[0]-previous_match
#print(match.group())
#print(f'<mod aa="{amino_acid}" pos="{amino_acid_number}" mod="{uniprot[match.group()]["mono_isotopique"]}" />')
if match.group() == "(UniMod:4)":
mono_isotopique=57.0214614868
elif match.group() == "(UniMod:35)":
mono_isotopique=15.9949102402
elif match.group() == "(UniMod:1)":
mono_isotopique=42.0105667114
mods.append({"amino_acid":amino_acid, "amino_acid_number":amino_acid_number, "mono_isotopique":mono_isotopique})
previous_match += match.span()[1]-match.span()[0]
mods_string.append(f"{amino_acid}{amino_acid_number}:{mono_isotopique}")
pep_ids.write(f'{tabline[13]}\t{tabline[14]}\t{" ".join(mods_string)}\n')
if sample in samples.keys():
samples[sample].append({"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods})
else:
samples[sample] = [{"proteins.Ids":tabline[3], "proteins.names":tabline[4], "sequence_mods":tabline[13], "sequence":tabline[14], "charge":tabline[16], "Q.value":tabline[17], "mods":mods}]
pep_ids.close()
outputfile = open("outputfile.xml", "w")
outputfile.write('<?xml version="1.0" encoding="utf-8" ?>\n')
outputfile.write('<peptide_result>\n')
outputfile.write('<filter evalue="0.01" />\n')
for sample in samples:
outputfile.write(f'<sample name="{sample}" file="{sample}">\n')
scanid = 0
modlines = ""
for peptiz in samples[sample]:
scanid = scanid+1
# print(f"scan {scanid} : {peptiz}")
current_pep = peptiz
prot = current_pep["proteins.Ids"].split(";")
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
if len(current_pep['mods']) != 0:
for mod in current_pep['mods']:
pep_mass += float(mod["mono_isotopique"])
outputfile.write(f"<scan num=\"{scanid}\" z=\"{current_pep['charge']}\" mhObs=\"{pep_mass}\">\n")
for i in range(len(prot)):
if len(current_pep['mods']) == 0:
pep_mass = ProteinAnalysis(current_pep['sequence']).molecular_weight()
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\"></psm>\n")
else:
outputfile.write(f"<psm seq=\"{current_pep['sequence']}\" mhTheo=\"{pep_mass}\" evalue=\"{current_pep['Q.value']}\" prot=\"{prot_descr[prot[i]]}\">\n")
for mod in current_pep['mods']:
outputfile.write(f'<mod aa="{mod["amino_acid"]}" pos="{mod["amino_acid_number"]}" mod="{mod["mono_isotopique"]}" />\n')
outputfile.write("</psm>\n")
outputfile.write("</scan>\n")
outputfile.write("</sample>\n")
outputfile.write("</peptide_result>\n")
outputfile.close()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment