#########################################################################
# MacSyFinder - Detection of macromolecular systems in protein dataset #
# using systems modelling and similarity search. #
# Authors: Sophie Abby, Bertrand Neron #
# Copyright (c) 2014-2024 Institut Pasteur (Paris) and CNRS. #
# See the COPYRIGHT file for details #
# #
# This file is part of MacSyFinder package. #
# #
# MacSyFinder is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# MacSyFinder is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details . #
# #
# You should have received a copy of the GNU General Public License #
# along with MacSyFinder (COPYING). #
# If not, see <https://www.gnu.org/licenses/>. #
#########################################################################
"""
This module focuses on the ways to serialize the different macsyfinder objects
(systems, solutions, special hits and rejected candidates)
"""
import abc
import typing
from string import Template
from .gene import GeneStatus
from .system import System, RejectedCandidate, LikelySystem, UnlikelySystem, HitSystemTracker
from .solution import Solution
from .hit import Loner, MultiSystem
[docs]
class SystemSerializer(metaclass=abc.ABCMeta):
    """
    Abstract base of the system serializers:
    each concrete subclass implements one way to serialize a system
    """

    @abc.abstractmethod
    def serialize(self, system: System, hit_system_tracker: HitSystemTracker):
        """
        Serialize one system. Must be implemented by the concrete serializers.

        :param system: the system to serialize
        :param hit_system_tracker: the tracker given to the concrete serializers
                                   alongside the system
        """
[docs]
class TxtSystemSerializer(SystemSerializer):
    """
    Render a System as a plain text report
    """

    def serialize(self, system: System, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param system: the system to serialize
        :param hit_system_tracker: looked up with each hit to get the systems using this hit;
                                   only the systems whose model differs from the model
                                   of *system* are reported
        :return: a string representation of system readable by human
        """
        model_fqn = system.model.fqn

        # one "[(id, gene name, position), ...]" group per cluster
        cluster_reprs = []
        for cluster in system.clusters:
            members = ", ".join([str((v_h.id, v_h.gene.name, v_h.position)) for v_h in cluster.hits])
            cluster_reprs.append(f"[{members}]")
        clst = ", ".join(cluster_reprs)

        report = [
            f"system id = {system.id}\n",
            f"model = {model_fqn}\n",
            f"replicon = {system.replicon_name}\n",
            f"clusters = {clst}\n",
            f"occ = {system.occurrence()}\n",
            f"wholeness = {system.wholeness:.3f}\n",
            f"loci nb = {system.loci_nb}\n",
            f"score = {system.score:.3f}\n",
        ]

        occurrences_by_role = (
            ("mandatory", system.mandatory_occ),
            ("accessory", system.accessory_occ),
            ("neutral", system.neutral_occ),
        )
        for title, genes in occurrences_by_role:
            report.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                labels = []
                for one_hit in gene_hits:
                    # ids of the systems, built from another model, which use this hit too
                    elsewhere = sorted(other.id for other in hit_system_tracker[one_hit.hit]
                                       if other.model.fqn != model_fqn)
                    label = f"{one_hit.gene.name}"
                    if elsewhere:
                        label = f"{label} [{', '.join(elsewhere)}]"
                    labels.append(label)
                report.append(f"\t- {g_name}: {len(gene_hits)} ({', '.join(labels)})\n")
        return "".join(report)
[docs]
class TsvSystemSerializer(SystemSerializer):
    """
    Serialize a System in tsv format, one line per hit
    """

    # column titles, tab separated, without trailing newline
    header = "\t".join((
        "replicon", "hit_id", "gene_name", "hit_pos", "model_fqn",
        "sys_id", "sys_loci", "locus_num", "sys_wholeness", "sys_score", "sys_occ",
        "hit_gene_ref", "hit_status", "hit_seq_len", "hit_i_eval", "hit_score",
        "hit_profile_cov", "hit_seq_cov", "hit_begin_match", "hit_end_match",
        "counterpart", "used_in",
    ))

    # one placeholder per column, in the same order as the header, newline terminated
    template = Template("\t".join((
        "$sys_replicon_name", "$mh_id", "$mh_gene_name", "$mh_position", "$sys_model_fqn",
        "$sys_id", "$sys_loci", "$locus_num", "$sys_wholeness", "$sys_score", "$sys_occurrence",
        "$mh_gene_role", "$mh_status", "$mh_seq_length", "$mh_i_eval", "$mh_score",
        "$mh_profile_coverage", "$mh_sequence_coverage", "$mh_begin_match", "$mh_end_match",
        "$mh_counterpart", "$used_in_systems",
    )) + "\n")

    def serialize(self, system: System, hit_system_tracker: HitSystemTracker) -> str:
        r"""
        :param system: The system to serialize.
        :param hit_system_tracker: looked up with each hit to get the systems using this hit;
                                   only the ids of the systems whose model differs from the model
                                   of *system* are written in the last column.
        :return: a serialisation of this system in tabulated separated value format.
                 Clusters are paired with ``system.loci_num``, the hits of a cluster are
                 sorted by position, each line represent a hit and have the following structure:

        .. code-block:: text

            replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_loci\tlocus_num\tsys_wholeness\tsys_score
            \tsys_occ\thit_gene_ref.alternate_of\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov
            \thit_seq_cov\thit_begin_match\thit_end_match\tcounterpart\tused_in_systems

        """
        rows = []
        for locus_num, cluster in zip(system.loci_num, system.clusters):
            for mh in sorted(cluster.hits, key=lambda one_hit: one_hit.position):
                other_systems = sorted(other.id for other in hit_system_tracker[mh.hit]
                                       if other.model.fqn != system.model.fqn)
                fields = {
                    "sys_replicon_name": system.replicon_name,
                    "mh_id": mh.id,
                    "mh_gene_name": mh.gene.name,
                    "mh_position": mh.position,
                    "sys_model_fqn": system.model.fqn,
                    "sys_id": system.id,
                    "sys_loci": system.loci_nb,
                    "locus_num": locus_num,
                    "sys_wholeness": f"{system.wholeness:.3f}",
                    "sys_score": f"{system.score:.3f}",
                    "sys_occurrence": system.occurrence(),
                    "mh_gene_role": mh.gene_ref.alternate_of().name,
                    "mh_status": mh.status,
                    "mh_seq_length": mh.seq_length,
                    "mh_i_eval": mh.i_eval,
                    "mh_score": f"{mh.score:.3f}",
                    "mh_profile_coverage": f"{mh.profile_coverage:.3f}",
                    "mh_sequence_coverage": f"{mh.sequence_coverage:.3f}",
                    "mh_begin_match": mh.begin_match,
                    "mh_end_match": mh.end_match,
                    "mh_counterpart": ','.join([h.id for h in mh.counterpart]),
                    "used_in_systems": ','.join(other_systems),
                }
                rows.append(self.template.substitute(fields))
        return "".join(rows)
[docs]
class TsvSolutionSerializer:
    """
    Handle Solution (list of Systems) serialization in tsv format
    """

    header = 'sol_id\t' + TsvSystemSerializer.header
    # "$$sol_id" is the escaped form of a literal "$sol_id" marker put in front
    # of the system row template; serialize() replaces it by the solution identifier.
    template = Template(f"$$sol_id\t{TsvSystemSerializer.template.template}")

    def serialize(self, solution: Solution, sol_id: int, hit_system_tracker: HitSystemTracker) -> str:
        """
        :param solution: the solution to serialize
        :param sol_id: the solution identifier
        :param hit_system_tracker: passed as is to
               :meth:`macsypy.serialization.TsvSystemSerializer.serialize` for each system
        :return: a serialisation of this solution (a list of systems) in tabulated separated value format
                 each line represent a hit and have the same structure as system serialization
                 :meth:`macsypy.serialization.TsvSystemSerializer.serialize` but with an extra column
                 sol_id which is a technical id to identify the different solutions.
                 The lines of each system are followed by an empty line.
        """
        # Bind the solution id in the row template *before* the hit fields are substituted.
        # The rows are then produced by a single substitution pass: the serialized values
        # are never parsed again as a Template, so a '$' in a replicon name or a hit id
        # can neither raise (invalid/unknown placeholder) nor be altered ('$$' -> '$').
        # '$' is doubled so the id itself is taken literally by Template.
        sol_id_field = str(sol_id).replace("$", "$$")
        row_template = Template(self.template.template.replace("$$sol_id", sol_id_field))

        sys_ser = TsvSystemSerializer()
        sys_ser.template = row_template  # instance attribute, the class template is untouched

        chunks = []
        for system in solution:
            chunks.append(sys_ser.serialize(system, hit_system_tracker))
            chunks.append("\n")  # blank line between two systems
        return "".join(chunks)
[docs]
class TxtLikelySystemSerializer(SystemSerializer):
    """
    Render a LikelySystem as a plain text report
    """

    def serialize(self, system: LikelySystem, hit_system_tracker: HitSystemTracker):
        """
        :param system: The likely system to serialize. Used only for unordered db-type
        :param hit_system_tracker: looked up with each hit to get the systems using this hit;
                                   only the systems whose model differs from the model
                                   of *system* are reported
        :return: a string representation of system readable by human
        """
        model_fqn = system.model.fqn
        all_hits = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        warning = ("WARNING there quorum is reached but there is also some forbidden genes.\n"
                   if system.forbidden_hits else '\n')

        report = [
            f"This replicon contains genetic materials needed for system {model_fqn}\n",
            f"{warning}\n",
            f"system id = {system.id}\n",
            f"model = {model_fqn}\n",
            f"replicon = {system.replicon_name}\n",
            f"hits = [{all_hits}]\n",
            f"wholeness = {system.wholeness:.3f}\n",
        ]

        occurrences_by_role = (
            ("mandatory", system.mandatory_occ),
            ("accessory", system.accessory_occ),
            ("neutral", system.neutral_occ),
            ("forbidden", system.forbidden_occ),
        )
        for title, genes in occurrences_by_role:
            report.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                labels = []
                for one_hit in gene_hits:
                    # ids of the systems, built from another model, which use this hit too
                    elsewhere = sorted(other.id for other in hit_system_tracker[one_hit.hit]
                                       if other.model.fqn != model_fqn)
                    label = f"{one_hit.gene.name}"
                    if elsewhere:
                        label = f"{label} [{', '.join(elsewhere)}]"
                    labels.append(label)
                report.append(f"\t- {g_name}: {len(gene_hits)} ({', '.join(labels)})\n")

        report.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(report)
[docs]
class TsvLikelySystemSerializer(SystemSerializer):
    """
    Handle potential System from unordered replicon
    serialization in tsv format
    """

    # column titles, tab separated, without trailing newline
    header = "replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness" \
             "\thit_gene_ref\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov\thit_seq_cov\t" \
             "hit_begin_match\thit_end_match\tused_in"

    # one placeholder per column, in the same order as the header, newline terminated
    template = Template("$sys_replicon_name\t$mh_id\t$mh_gene_name\t$mh_position\t$sys_model_fqn\t"
                        "$sys_id\t$sys_wholeness\t"
                        "$mh_gene_role\t$mh_status\t$mh_seq_length\t$mh_i_eval\t"
                        "$mh_score\t$mh_profile_coverage\t$mh_sequence_coverage\t$mh_begin_match"
                        "\t$mh_end_match\t$used_in_systems\n")

    def serialize(self, system: LikelySystem, hit_system_tracker: HitSystemTracker) -> str:
        r"""
        :param system: The likely system to serialize. Used only for unordered db-type
        :param hit_system_tracker: looked up with each hit to get the systems using this hit;
                                   only the ids of the systems whose model differs from the model
                                   of *system* are written in the last column.
        :return: a serialisation of this system in tabulated separated value format.
                 The hits are grouped by gene status (in the order of the
                 :class:`GeneStatus` members) and sorted by gene name inside each group;
                 each line represent a hit and have the following structure:

        .. code-block:: text

            replicon\thit_id\tgene_name\thit_pos\tmodel_fqn\tsys_id\tsys_wholeness
            \thit_gene_ref.alternate_of\thit_status\thit_seq_len\thit_i_eval\thit_score\thit_profile_cov
            \thit_seq_cov\thit_begin_match\thit_end_match\tused_in_systems

        """
        rows = []
        for status in (s.lower() for s in GeneStatus.__members__):
            # A status without a matching '<status>_hits' attribute on the system is skipped.
            # Only the attribute lookup is guarded: an AttributeError raised while sorting
            # (a hit without gene or name) is a real error and must not silently
            # drop the whole status category.
            try:
                hits = getattr(system, f"{status}_hits")
            except AttributeError:
                continue
            for mh in sorted(hits, key=lambda one_hit: one_hit.gene.name):
                used_in_systems = sorted(other.id for other in hit_system_tracker[mh.hit]
                                         if other.model.fqn != system.model.fqn)
                rows.append(self.template.substitute(
                    sys_replicon_name=system.replicon_name,
                    mh_id=mh.id,
                    mh_gene_name=mh.gene.name,
                    mh_position=mh.position,
                    sys_model_fqn=system.model.fqn,
                    sys_id=system.id,
                    sys_wholeness=f"{system.wholeness:.3f}",
                    mh_gene_role=mh.gene_ref.alternate_of().name,
                    mh_status=mh.status,
                    mh_seq_length=mh.seq_length,
                    mh_i_eval=mh.i_eval,
                    mh_score=f"{mh.score:.3f}",
                    mh_profile_coverage=f"{mh.profile_coverage:.3f}",
                    mh_sequence_coverage=f"{mh.sequence_coverage:.3f}",
                    mh_begin_match=mh.begin_match,
                    mh_end_match=mh.end_match,
                    used_in_systems=','.join(used_in_systems)
                ))
        return "".join(rows)
[docs]
class TxtUnikelySystemSerializer(SystemSerializer):
    """
    Handle UnlikelySystem serialization in text
    """

    def serialize(self, system: UnlikelySystem, hit_system_tracker: HitSystemTracker | None = None) -> str:
        """
        :param system: The unlikely system to serialize. (used only if db-type is "unordered_replicon")
        :param hit_system_tracker: not used by this serializer. It is accepted (and optional) only
                                   to be compatible with the signature declared by
                                   ``SystemSerializer.serialize``, so this serializer
                                   can be called like the other system serializers.
        :return: a string representation of system readable by human
        """
        all_hits = ", ".join([str((h.id, h.gene.name, h.position)) for h in system.hits])
        reasons = '\n'.join(system.reasons)  # one reason per line
        report = [f"""This replicon probably not contains a system {system.model.fqn}:
{reasons}
system id = {system.id}
model = {system.model.fqn}
replicon = {system.replicon_name}
hits = [{all_hits}]
wholeness = {system.wholeness:.3f}
"""]
        for title, genes in (("mandatory", system.mandatory_occ),
                             ("accessory", system.accessory_occ),
                             ("neutral", system.neutral_occ),
                             ("forbidden", system.forbidden_occ)):
            report.append(f"\n{title} genes:\n")
            for g_name, gene_hits in genes.items():
                all_hits_str = [f"{h.gene.name}" for h in gene_hits]
                report.append(f"\t- {g_name}: {len(gene_hits)} ({', '.join(all_hits_str)})\n")
        report.append("\nUse ordered replicon to have better prediction.\n")
        return "".join(report)
[docs]
class TsvSpecialHitSerializer:
    """
    Serialize special hits: :class:`macsypy.hit.Loner` and :class:`macsypy.hit.MultiSystem` in tsv format
    """

    def serialize(self, best_hits: typing.Iterable[Loner] | typing.Iterable[MultiSystem]) -> str:
        """
        :param best_hits: the special hits to serialized
        :type best_hits: iterable of :class:`macsypy.hit.Loner` or :class:`macsypy.hit.MultiSystem` objects
        :return: a header line followed by one line per hit (the best hits and their counterparts,
                 without duplicates) sorted by position,
                 or an empty string if there is no hit to serialize.
        """
        if not best_hits:
            # None or empty container
            return ""
        # best_hits is traversed twice (the hits themselves, then their counterparts);
        # materialize it once so that one-shot iterables (generator, iterator)
        # are supported too, as advertised by the signature.
        best_hits = list(best_hits)
        if not best_hits:
            # the iterable did not yield any hit: nothing to report, not even the header
            return ""

        special_hits = set(best_hits)
        for best_hit in best_hits:
            special_hits.update(best_hit.counterpart)

        rows = ["replicon\tmodel_fqn\tfunction\tgene_name\t"
                "hit_id\thit_pos\thit_status\thit_seq_len\t"
                "hit_i_eval\thit_score\thit_profile_cov\t"
                "hit_seq_cov\thit_begin_match\thit_end_match\n"]
        for one_hit in sorted(special_hits, key=lambda h: h.position):
            rows.append(
                f"{one_hit.replicon_name}\t{one_hit.gene_ref.model.fqn}\t{one_hit.gene_ref.alternate_of().name}\t"
                f"{one_hit.gene_ref.name}\t{one_hit.id}\t{one_hit.position:d}\t{one_hit.status}\t"
                f"{one_hit.seq_length:d}\t{one_hit.i_eval:.3e}\t{one_hit.score:.3f}\t"
                f"{one_hit.profile_coverage:.3f}\t{one_hit.sequence_coverage:.3f}\t"
                f"{one_hit.begin_match:d}\t{one_hit.end_match:d}\n"
            )
        return "".join(rows)
[docs]
class TsvRejectedCandidatesSerializer:
    """
    Serialize the rejected candidates in tsv format
    """

    def serialize(self, candidates: list[RejectedCandidate]) -> str:
        """
        :param candidates: list of rejected candidates to serialize
        :return: a header line followed by one line per hit of each cluster of each candidate,
                 the lines of a candidate are followed by an empty line.
                 An empty string if there is no candidate.
        """
        if not candidates:
            return ""

        chunks = ["candidate_id\treplicon\tmodel_fqn\tcluster_id\thit_id\thit_pos\tgene_name\tfunction\treasons\n"]
        for candidate in candidates:
            # all the reasons of the rejection are gathered in the last column
            reasons = '/'.join(candidate.reasons)
            for cluster in candidate.clusters:
                chunks.extend(
                    f"{candidate.id}\t{candidate.replicon_name}\t{candidate.model.fqn}\t"
                    f"{cluster.id}\t{hit.id}\t{hit.position}\t{hit.gene_ref.name}"
                    f"\t{hit.gene_ref.alternate_of().name}\t"
                    f"{reasons}\n"
                    for hit in cluster.hits
                )
            chunks.append('\n')  # blank line between two candidates
        return "".join(chunks)