Source code for bionetgen.atomizer.atomizer.moleculeCreation

# -*- coding: utf-8 -*-
"""
Created on Tue Apr  2 21:06:43 2013

@author: proto
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Dec  6 17:42:31 2011

@author: proto
"""
from copy import deepcopy, copy
from . import analyzeSBML
import bionetgen.atomizer.utils.structures as st
from bionetgen.atomizer.utils.util import logMess
import re

# import biogrid
import marshal
import functools
import bionetgen.atomizer.utils.pathwaycommons as pwcm
from collections import Counter, defaultdict
import itertools
from .atomizerUtils import BindingException
from . import resolveSCT
from . import atomizationAux as atoAux


[docs]def isInComplexWith(moleculeSet, parser=None):
    """
    given a list of binding candidates, it gets the uniprot ID from annotation and queries
    the pathway commons class to see if there's known binding information for those two
    """
    validPairs = []
    for element in moleculeSet:
        # if element[0] == element[1]:
        #    return []

        name1 = atoAux.getURIFromSBML(element[0], parser, ["uniprot", "go"])
        name2 = atoAux.getURIFromSBML(element[1], parser, ["uniprot", "go"])

        modelAnnotation = parser.extractModelAnnotation()
        modelOrganism = (
            modelAnnotation["BQB_OCCURS_IN"]
            if "BQB_OCCURS_IN" in modelAnnotation
            else None
        )
        molecule1 = name1[0].split("/")[-1] if name1 else element[0]
        molecule2 = name2[0].split("/")[-1] if name2 else element[1]

        simpleOrganism = (
            [x.split("/")[-1] for x in modelOrganism] if modelOrganism else None
        )

        bindingResults = pwcm.queryBioGridByName(
            molecule1, molecule2, simpleOrganism, element[0], element[1]
        )
        if not bindingResults:
            bindingResults = pwcm.queryBioGridByName(
                element[0], element[1], None, None, None
            )
        # bindingResults = None
        # bindingResults = pwcm.isInComplexWith([element[0], name1], [element[1], name2], organism=modelOrganism)
        if bindingResults:
            validPairs.append(element)
    # use pathway commosn as fallback since its much slower
    # it is stupid slow. Enable it as a not on default option
    if not validPairs:
        for element in moleculeSet:
            # if element[0] == element[1]:
            #    return []

            name1 = atoAux.getURIFromSBML(element[0], parser, ["uniprot", "go"])
            name2 = atoAux.getURIFromSBML(element[1], parser, ["uniprot", "go"])
            modelAnnotation = parser.extractModelAnnotation()
            modelOrganism = (
                modelAnnotation["BQB_OCCURS_IN"]
                if "BQB_OCCURS_IN" in modelAnnotation
                else None
            )
            bindingResults = pwcm.isInComplexWith(
                [element[0], name1], [element[1], name2], organism=modelOrganism
            )
            if bindingResults:
                validPairs.append(element)
    return validPairs


[docs]def identifyReaction(equivalenceDictionary, originalElement, modifiedElement):
    for classification in equivalenceDictionary:
        if set([originalElement, modifiedElement]) in [
            set(x) for x in equivalenceDictionary[classification]
        ]:
            return classification
    return None


[docs]def createEmptySpecies(name):
    species = st.Species()
    molecule = st.Molecule(name)
    species.addMolecule(molecule)
    return species


[docs]def addStateToComponent(species, moleculeName, componentName, state):
    for molecule in species.molecules:
        if moleculeName == molecule.name:
            for component in molecule.components:
                if componentName == component.name:
                    tmp = component.activeState
                    if state not in component.states:
                        component.addState(state)
                    elif state in component.states:
                        component.setActiveState(state)
                    return tmp


[docs]def addComponentToMolecule(species, moleculeName, componentName):
    for molecule in species.molecules:
        if moleculeName == molecule.name:
            if componentName not in [x.name for x in molecule.components]:
                component = st.Component(componentName)
                molecule.addComponent(component)
                return True
    return False


[docs]def addBondToComponent(species, moleculeName, componentName, bond, priority=1):
    order = 1
    for molecule in species.molecules:
        if moleculeName == molecule.name:
            if (
                priority == order
                or len([x for x in species.molecules if x.name == moleculeName])
                == order
            ):
                for component in molecule.components:
                    if componentName == component.name:
                        # if we are adding a second bond to the same component
                        # it actually means that we have two components with the
                        # same name
                        if len(component.bonds) == 0:
                            component.addBond(bond)
                            return
                        else:
                            newComponent = st.Component(componentName)
                            newComponent.addBond(bond)
                            molecule.addComponent(newComponent)
                            return
            else:
                order += 1


[docs]def solveComplexBinding(totalComplex, pathwaycommonsFlag, parser, compositionEntry):
    """
    given two binding complexes it will attempt to find the ways in which they bind using different criteria

    """

    def sortMolecules(array, reverse):
        return sorted(
            array,
            key=lambda molecule: (
                len(molecule.components),
                len([x for x in molecule.components if x.activeState not in [0, "0"]]),
                len(str(molecule)),
                str(molecule),
            ),
            reverse=reverse,
        )

    def getBiggestMolecule(array):
        sortedMolecule = sortMolecules(array, reverse=False)

        # sortedMolecule = sorted(sortedMolecule, key=lambda rule: len(rule.components))

        return sortedMolecule[-1]

    def getNamedMolecule(array, name):
        for molecule in sortMolecules(array, True):
            if molecule.name == name:
                return molecule
            elif molecule.trueName == name:
                return molecule

    names1 = [str(x.trueName) for x in totalComplex[0]]
    names2 = [str(x.trueName) for x in totalComplex[1]]
    bioGridDict = {}
    # find all pairs of molecules
    comb = set([tuple(sorted([x, y])) for x in names1 for y in names2])
    comb2 = set(
        [tuple(sorted([x, y])) for x in compositionEntry for y in compositionEntry]
    )

    dbPair = set([])
    combTemp = set()

    # search pathway commons for binding candidates
    if pathwaycommonsFlag:
        dbPair = isInComplexWith(comb, parser)
    else:
        for element in comb:
            if (
                element[0].upper() in bioGridDict
                and element[1] in bioGridDict[element[0].upper()]
                or element[1].upper() in bioGridDict
                and element[0] in bioGridDict[element[1].upper()]
            ):
                # logMess('INFO:ATO001', 'Biogrid info: {0}:{1}'.format(element[0], element[1]))
                dbPair.add((element[0], element[1]))
        # elif pathwaycommonsFlag:
        #    if pwcm.isInComplexWith(element[0], element[1]):
        #        dbPair.add((element[0], element[1]))
    dbPair = list(dbPair)

    if dbPair != []:
        mol1 = mol2 = None
        # select the best candidate if there's many ways to bind (in general
        # one that doesn't overlap with an already exising pair)
        finalDBpair = []
        if len(dbPair) > 1:
            for element in dbPair:
                mset1 = Counter(element)
                mset2 = Counter(names1)
                mset3 = Counter(names2)
                intersection1 = mset1 & mset2
                intersection2 = mset1 & mset3
                intersection1 = list(intersection1.elements())
                intersection2 = list(intersection2.elements())
                if len(intersection1) < 2 and len(intersection2) < 2:
                    finalDBpair.append(element)
        if len(finalDBpair) > 0:
            dbPair = finalDBpair

        if len(dbPair) > 1:
            # @FIXME: getNamedMolecule should never receive parameters that cause it to return null, but somehow that's what is happening
            # when you receive a malformed user definition file. The error
            # should be caught way before we reach this point
            tmpComplexSubset1 = [
                getNamedMolecule(totalComplex[0], element[0])
                for element in dbPair
                if getNamedMolecule(totalComplex[0], element[0]) is not None
            ]
            if not tmpComplexSubset1:
                tmpComplexSubset1 = [
                    getNamedMolecule(totalComplex[0], element[1])
                    for element in dbPair
                    if getNamedMolecule(totalComplex[0], element[1]) is not None
                ]
                tmpComplexSubset2 = [
                    getNamedMolecule(totalComplex[1], element[0])
                    for element in dbPair
                    if getNamedMolecule(totalComplex[1], element[0]) is not None
                ]
            else:
                tmpComplexSubset2 = [
                    getNamedMolecule(totalComplex[1], element[1])
                    for element in dbPair
                    if getNamedMolecule(totalComplex[1], element[1]) is not None
                ]

            mol1 = getBiggestMolecule(tmpComplexSubset1)
            mol2 = getBiggestMolecule(tmpComplexSubset2)
            # was ATO002
            logMess(
                "WARNING:ATO111",
                "{0}-{1}:The two pairs can bind in these ways according to BioGrid/Pathwaycommons:{2}:Defaulting to:('{3}', '{4}')".format(
                    names1, names2, dbPair, mol1.name, mol2.name
                ),
            )

        else:
            mol1 = getNamedMolecule(totalComplex[0], dbPair[0][0])
            if not mol1:
                mol1 = getNamedMolecule(totalComplex[1], dbPair[0][0])
                mol2 = getNamedMolecule(totalComplex[0], dbPair[0][1])

            else:
                mol2 = getNamedMolecule(totalComplex[1], dbPair[0][1])
                if not mol2:
                    mol1 = getNamedMolecule(totalComplex[1], dbPair[0][0])
                    mol2 = getNamedMolecule(totalComplex[0], dbPair[0][1])

            logMess(
                "INFO:ATO001",
                "Binding information found in BioGrid/Pathwaycommons for for {0}-{1}".format(
                    mol1.name, mol2.name
                ),
            )

    else:
        # mol1 = getBiggestMolecule(totalComplex[0])
        # mol2 = getBiggestMolecule(totalComplex[1])
        """
        if pathwaycommonsFlag:
            logMess('ERROR:ATO201', "We don't know how {0} and {1} bind together and there's no relevant BioGrid/Pathway commons information. Not atomizing".format(
                [x.name for x in totalComplex[0]], [x.name for x in totalComplex[1]]))
            # addAssumptions('unknownBond',(mol1.name,mol2.name))
        else:

            logMess('ERROR:ATO202', "We don't know how {0} and {1} bind together. Not atomizing".format(
                [x.name for x in totalComplex[0]], [x.name for x in totalComplex[1]]))
            # addAssumptions('unknownBond',(mol1.name,mol2.name))
        """
        raise BindingException(
            "{0}-{1}".format(
                sorted([x.name for x in totalComplex[0]]),
                sorted([x.name for x in totalComplex[1]]),
            ),
            comb,
        )

    return mol1, mol2


[docs]def getComplexationComponents2(
    moleculeName,
    species,
    bioGridFlag,
    pathwaycommonsFlag=False,
    parser=None,
    bondSeeding=[],
    bondExclusion=[],
    database=None,
):
    """
    method used during the atomization process. It determines how molecules
    in a species bind together
    """

    def sortMolecules(array, reverse):
        return sorted(
            array,
            key=lambda molecule: (
                len(molecule.components),
                len([x for x in molecule.components if x.activeState not in [0, "0"]]),
                len(str(molecule)),
                str(molecule),
            ),
            reverse=reverse,
        )

    def getBiggestMolecule(array):
        sortedMolecule = sortMolecules(array, reverse=False)

        # sortedMolecule = sorted(sortedMolecule, key=lambda rule: len(rule.components))

        return sortedMolecule[-1]

    def getNamedMolecule(array, name):
        for molecule in sortMolecules(array, True):
            if molecule.name == name:
                return molecule

    speciesDict = {}
    # this array will contain all molecules that bind together
    pairedMolecules = []
    for x in sortMolecules(species.molecules, reverse=True):
        for y in x.components:
            if y.name not in speciesDict:
                speciesDict[y.name] = []
            speciesDict[y.name].append(x)
    # this array wil contain all molecules that dont bind to anything
    orphanedMolecules = [x for x in species.molecules]
    # seed the list of pairs from the seeds
    pairedMolecules = copy(bondSeeding)
    if bondSeeding:
        orphanedMolecules = [
            x for x in orphanedMolecules for y in bondSeeding if x not in y
        ]

    # determine how molecules bind together
    redundantBonds = []

    for x in sortMolecules(species.molecules, reverse=True):
        for component in [
            y for y in x.components if y.name.lower() in list(speciesDict.keys())
        ]:
            if x.name.lower() in speciesDict:
                if (x in speciesDict[component.name.lower()]) and component.name in [
                    y.name.lower() for y in speciesDict[x.name.lower()]
                ]:
                    for mol in speciesDict[x.name.lower()]:
                        if (
                            mol.name.lower() == component.name
                            and x != mol
                            and x in speciesDict[component.name]
                        ):
                            speciesDict[x.name.lower()].remove(mol)
                            speciesDict[component.name].remove(x)
                            if (
                                x not in orphanedMolecules
                                and mol not in orphanedMolecules
                            ):
                                # FIXME: is it necessary to remove double bonds
                                # in complexes?

                                lhs = set([])
                                rhs = set([])
                                repeatedFlag = False
                                for pair in pairedMolecules:
                                    if x in pair:
                                        lhs.add(pair[0])
                                        lhs.add(pair[1])
                                    elif mol in pair:
                                        rhs.add(pair[0])
                                        rhs.add(pair[1])
                                    # is this particular pair of molecules bound together?
                                    if x in pair and mol in pair:
                                        repeatedFlag = True
                                        break
                                # this pair already exists
                                if repeatedFlag:
                                    continue
                                redundantBonds.append([x, mol])
                                intersection = lhs.intersection(rhs)
                                redundantBonds[-1].extend(list(intersection))
                                if len(redundantBonds[-1]) < 3:
                                    redundantBonds.pop()
                                # continue
                            if (
                                [x, mol] not in bondSeeding
                                and [mol, x] not in bondSeeding
                                and [x, mol] not in bondExclusion
                                and [mol, x] not in bondExclusion
                            ):
                                pairedMolecules.append([x, mol])
                            if x in orphanedMolecules:
                                orphanedMolecules.remove(x)
                            if mol in orphanedMolecules:
                                orphanedMolecules.remove(mol)

    if len(redundantBonds) > 0:
        for x in redundantBonds:
            if database:
                atoAux.addAssumptions(
                    "redundantBonds",
                    tuple(sorted([y.name for y in x])),
                    database.assumptions,
                )
                atoAux.addAssumptions(
                    "redundantBondsMolecules",
                    (tuple(sorted([y.name for y in x])), moleculeName),
                    database.assumptions,
                )
            logMess(
                "WARNING:CTX001",
                "Redundant bonds detected between molecules {0} in species {1}".format(
                    [y.name for y in x], moleculeName
                ),
            )
    totalComplex = [set(x) for x in pairedMolecules]
    isContinuousFlag = True

    # iterate over orphaned and find unidirectional interactions
    # e.g. if a molecule has a previous known interaction with the
    # same kind of molecule, even if it has no available components
    # e.g. k-mers`

    for element in speciesDict:
        for individualMolecule in speciesDict[element]:
            if individualMolecule in orphanedMolecules:
                candidatePartner = [
                    x
                    for x in species.molecules
                    if x.name.lower() == element and x != individualMolecule
                ]
                if len(candidatePartner) == 1:
                    pairedMolecules.append([candidatePartner[0], individualMolecule])
                    orphanedMolecules.remove(individualMolecule)
    # determine which pairs form a continuous chain

    while isContinuousFlag:
        isContinuousFlag = False
        for idx in range(0, len(totalComplex) - 1):
            for idx2 in range(idx + 1, len(totalComplex)):
                if len([x for x in totalComplex[idx] if x in totalComplex[idx2]]) > 0:
                    totalComplex[idx] = totalComplex[idx].union(totalComplex[idx2])
                    totalComplex.pop(idx2)
                    isContinuousFlag = True
                    break
            if isContinuousFlag:
                break
    # now we process those molecules where we need to create a new component
    for element in orphanedMolecules:
        for mol1 in species.molecules:
            # when adding orphaned molecules make sure it's not already in
            # the list
            if mol1 == element and mol1 not in set().union(*totalComplex):
                totalComplex.append(set([mol1]))
    # now we process for those molecules we are not sure how do they bind
    while len(totalComplex) > 1:
        if len(totalComplex[0]) == 1 and len(totalComplex[1]) == 1:
            mol1 = list(totalComplex[0])[0]
            mol2 = list(totalComplex[1])[0]
        else:
            mol1, mol2 = solveComplexBinding(
                totalComplex,
                pathwaycommonsFlag,
                parser,
                database.prunnedDependencyGraph[moleculeName][0],
            )
        pairedMolecules.append([mol1, mol2])
        totalComplex[0] = totalComplex[0].union(totalComplex[1])
        totalComplex.pop(1)
    # totalComplex.extend(orphanedMolecules)
    return pairedMolecules


[docs]def getTrueTag(dependencyGraph, molecule):
    """
    given any modified or basic element it returns its basic
    name
    """
    if dependencyGraph[molecule] == []:
        return molecule
    elif dependencyGraph[molecule][0][0] == molecule:
        return molecule
    else:
        return getTrueTag(dependencyGraph, dependencyGraph[molecule][0][0])


[docs]def createCatalysisRBM(
    dependencyGraph,
    element,
    translator,
    reactionProperties,
    equivalenceDictionary,
    sbmlAnalyzer,
    database,
):
    """
    if it's a catalysis reaction create a new component/state
    """
    if dependencyGraph[element[0]][0][0] == element[0]:
        if element[0] not in translator:
            translator[element[0]] = createEmptySpecies(element[0])
    else:
        componentStateArray = []
        tmp = element[0]
        existingComponents = []
        memory = []
        forceActivationSwitch = False
        while dependencyGraph[tmp] != []:
            # what kind of catalysis are we dealing with
            # classification = identifyReaction(
            #                                  equivalenceDictionary,
            #                                  dependencyGraph[tmp][0][0],tmp)

            classifications = None
            if not classifications:
                classifications = identifyReaction(
                    equivalenceDictionary, dependencyGraph[tmp][0][0], tmp
                )
                classifications = (
                    classifications if classifications in reactionProperties else None
                )
                if classifications is not None:
                    classifications = [classifications]
            if not classifications:
                classifications = sbmlAnalyzer.findMatchingModification(
                    tmp, dependencyGraph[tmp][0][0]
                )

            if not classifications:
                classifications = sbmlAnalyzer.findMatchingModification(
                    element[0], dependencyGraph[tmp][0][0]
                )

            # if we know what classification it is then add the corresponding
            # components and states
            if classifications is not None:
                for classification in classifications:
                    componentStateArray.append(reactionProperties[classification])
                    # classificationArray.append([classification,
                    #                            tmp,dependencyGraph[tmp]
                    #                            [0][0]])
                    existingComponents.append(reactionProperties[classification][0])
            # if we don't know we can create a force 1:1 modification
            elif (
                database.forceModificationFlag
                and classifications is None
                and not forceActivationSwitch
            ):
                forceActivationSwitch = True
                baseName = getTrueTag(
                    dependencyGraph, dependencyGraph[element[0]][0][0]
                )

                species = createEmptySpecies(baseName)

                componentStateArray.append(["{0}".format(tmp), tmp])
                if not (
                    element[0] in database.userLabelDictionary
                    and database.userLabelDictionary[element[0]][0][0] == baseName
                ):
                    logMess(
                        "WARNING:LAE002",
                        "adding forced transformation: {0}:{1}:{2}".format(
                            baseName, dependencyGraph[element[0]][0][0], element[0]
                        ),
                    )
                # return
            # bail out if we couldn't figure out what modification it is
            elif classifications is None:
                logMess(
                    "DEBUG:MSC001",
                    "unregistered modification: {0}:{1}".format(
                        element[0], dependencyGraph[element[0]]
                    ),
                )
            memory.append(tmp)
            tmp = dependencyGraph[tmp][0][0]
            if tmp in memory:
                raise atoAux.CycleError(memory)
        baseName = getTrueTag(dependencyGraph, dependencyGraph[element[0]][0][0])

        species = createEmptySpecies(baseName)
        # use the already existing structure if its in the
        # translator,otherwise empty
        if baseName in translator:
            species = translator[baseName]
        # modifiedSpecies = deepcopy(translator[dependencyGraph[element[0]][0][0]])

        # modified species needs to start from the base speceis sine componentStateArray should contain the full set of modifications
        # check that this works correctly for double modifications
        modifiedSpecies = deepcopy(translator[baseName])
        # this counter is here for multi level modification events (e.g. double
        # phosporylation)
        modificationCounter = {
            componentState[0]: 2 for componentState in componentStateArray
        }
        for componentState in componentStateArray:
            addComponentToMolecule(species, baseName, componentState[0])
            addComponentToMolecule(modifiedSpecies, baseName, componentState[0])
            tmp = addStateToComponent(
                species, baseName, componentState[0], componentState[1]
            )
            if tmp == componentState[1]:
                addStateToComponent(
                    species,
                    baseName,
                    componentState[0],
                    componentState[1] + componentState[1],
                )
            # this modification was already activated so create a second
            # modification component
            if (
                addStateToComponent(
                    modifiedSpecies, baseName, componentState[0], componentState[1]
                )
                == componentState[1]
            ):
                componentName = "{0}{1}".format(
                    componentState[0], modificationCounter[componentState[0]]
                )
                modificationCounter[componentState[0]] += 1
                addComponentToMolecule(modifiedSpecies, baseName, componentName)
                addStateToComponent(
                    modifiedSpecies, baseName, componentName, componentState[1]
                )
            addStateToComponent(species, baseName, componentState[0], "0")
        # update the base species
        if len(componentStateArray) > 0:
            translator[baseName] = deepcopy(species)
            translator[element[0]] = modifiedSpecies


globalNumberGenerator = []


[docs]def getBondNumber(molecule1, molecule2):
    """
    keeps a model-level registry of of all the molecule pairs and returns a unique index
    """
    moleculeList = tuple(sorted([molecule1, molecule2]))
    if moleculeList not in globalNumberGenerator:
        globalNumberGenerator.append(moleculeList)

    return globalNumberGenerator.index(moleculeList)


[docs]def createBindingRBM(
    element,
    translator,
    dependencyGraph,
    bioGridFlag,
    pathwaycommonsFlag,
    parser,
    database,
):
    species = st.Species()

    # go over the sct and reuse existing stuff

    for molecule in dependencyGraph[element[0]][0]:
        if molecule in translator:
            tmpSpecies = translator[molecule]
            if molecule != getTrueTag(dependencyGraph, molecule):
                original = translator[getTrueTag(dependencyGraph, molecule)]
                updateSpecies(tmpSpecies, original.molecules[0])
            if tmpSpecies.molecules[0].name in database.constructedSpecies:
                tmpSpecies.molecules[0].trueName = molecule
            else:
                tmpSpecies.molecules[0].trueName = tmpSpecies.molecules[0].name
            species.addMolecule(deepcopy(tmpSpecies.molecules[0]))
        else:
            mol = st.Molecule(molecule)
            mol.trueName = molecule
            # dependencyGraph[molecule] = deepcopy(mol)
            species.addMolecule(mol)
    dependencyGraphCounter = Counter(dependencyGraph[element[0]][0])

    # XXX: this wont work for species with more than one molecule with the
    # same name
    changeFlag = False
    partialBonds = defaultdict(list)
    for partialUserEntry in database.partialUserLabelDictionary:
        partialCounter = Counter(partialUserEntry)
        if all(
            [partialCounter[x] <= dependencyGraphCounter[x] for x in partialCounter]
        ):
            changeFlag = True
            for molecule in database.partialUserLabelDictionary[
                partialUserEntry
            ].molecules:
                for molecule2 in species.molecules:
                    if molecule.name == molecule2.name:
                        for component in molecule.components:
                            for bond in component.bonds:
                                if molecule2.name not in [
                                    x.name for x in partialBonds[bond]
                                ]:
                                    partialBonds[bond].append(molecule2)
                        """
                        for component in molecule.components:
                            component2 = [x for x in molecule2.components if x.name == component.name]
                            # component already exists in species template
                            if component2:
                                if component.bonds:
                                    component2[0].bonds = component.bonds
                            else:
                                molecule2.addComponent(deepcopy(component))
                        """

    bondSeeding = [partialBonds[x] for x in partialBonds if x > 0]
    bondExclusion = [partialBonds[x] for x in partialBonds if x < 0]
    # how do things bind together?
    moleculePairsList = getComplexationComponents2(
        element[0],
        species,
        bioGridFlag,
        pathwaycommonsFlag,
        parser,
        bondSeeding,
        bondExclusion,
        database,
    )

    # moleculeCount = Counter([y for x in moleculePairsList for y in x])
    # print moleculeCount
    # moleculePairsList = [sorted(x) for x in moleculePairsList]
    # moleculePairsList.sort(key=lambda x: [-moleculeCount[x[0]],(str(x[0]), x[0],str(x[1]),x[1])])
    # TODO: update basic molecules with new components
    # translator[molecule[0].name].molecules[0].components.append(deepcopy(newComponent1))
    # translator[molecule[1].name].molecules[0].components.append(deepcopy(newComponent2))
    moleculeCounter = defaultdict(list)
    for molecule in moleculePairsList:
        flag = False

        # create an index on m0 and m1 depending on their name and repeats in
        # the species
        if molecule[0] not in moleculeCounter[molecule[0].name]:
            moleculeCounter[molecule[0].name].append(molecule[0])
        if molecule[1] not in moleculeCounter[molecule[1].name]:
            moleculeCounter[molecule[1].name].append(molecule[1])
        m0index = moleculeCounter[molecule[0].name].index(molecule[0])
        m1index = moleculeCounter[molecule[1].name].index(molecule[1])
        bondIdx = getBondNumber(
            "{0}{1}".format(molecule[0].name, m0index),
            "{0}{1}".format(molecule[1].name, m1index),
        )
        # add bonds where binding components already exist and they are not
        # occupied
        for component in molecule[0].components:
            if component.name == molecule[1].name.lower() and len(component.bonds) == 0:
                component.bonds.append(bondIdx)
                flag = True
                break
        if not flag:
            # create components if they dont exist already.
            # Add a bond afterwards
            newComponent1 = st.Component(molecule[1].name.lower())

            molecule[0].components.append(newComponent1)

            try:
                if newComponent1.name not in [
                    x.name for x in translator[molecule[0].name].molecules[0].components
                ]:
                    translator[molecule[0].name].molecules[0].components.append(
                        deepcopy(newComponent1)
                    )
            except KeyError as e:
                print(
                    "The translator doesn't know the molecule: {}".format(
                        molecule[0].name
                    )
                )
                raise e
            molecule[0].components[-1].bonds.append(bondIdx)
        flag = False
        # same thing for the other member of the bond
        for component in molecule[1].components:
            if component.name == molecule[0].name.lower() and len(component.bonds) == 0:
                component.bonds.append(bondIdx)
                flag = True
                break
        if not flag:
            newComponent2 = st.Component(molecule[0].name.lower())
            molecule[1].components.append(newComponent2)
            if molecule[0].name != molecule[1].name:
                if newComponent2.name not in [
                    x.name for x in translator[molecule[1].name].molecules[0].components
                ]:
                    translator[molecule[1].name].molecules[0].components.append(
                        deepcopy(newComponent2)
                    )
            molecule[1].components[-1].bonds.append(bondIdx)

    # update the translator
    translator[element[0]] = species


[docs]def atomize(
    dependencyGraph,
    weights,
    translator,
    reactionProperties,
    equivalenceDictionary,
    bioGridFlag,
    sbmlAnalyzer,
    database,
    parser,
):
    """
    The atomizer's main methods. Receives a dependency graph
    """
    redrawflag = True
    loops = 0
    while redrawflag and loops < 10:
        loops += 1
        bindingCounter = Counter()
        bindingFailureDict = {}

        for idx, element in enumerate(weights):
            # unnamed molecule?
            if len(element[0]) == 0:
                continue
            # 0 molecule
            if element[0] == "0":
                continue
            # user defined molecules to be the zero molecule
            if dependencyGraph[element[0]] == [["0"]]:
                zeroSpecies = st.Species()
                zeroMolecule = st.Molecule("0")
                zeroSpecies.addMolecule(zeroMolecule)
                translator[element[0]] = zeroSpecies
                continue
            # undivisible molecules

            elif dependencyGraph[element[0]] == []:
                if element[0] not in translator:
                    translator[element[0]] = createEmptySpecies(element[0])
            else:
                if len(dependencyGraph[element[0]][0]) == 1:
                    # catalysis
                    createCatalysisRBM(
                        dependencyGraph,
                        element,
                        translator,
                        reactionProperties,
                        equivalenceDictionary,
                        sbmlAnalyzer,
                        database,
                    )
                else:
                    try:
                        createBindingRBM(
                            element,
                            translator,
                            dependencyGraph,
                            bioGridFlag,
                            database.pathwaycommons,
                            parser,
                            database,
                        )

                    except BindingException as e:
                        for c in e.combinations:
                            bindingCounter[c] += 1
                            bindingFailureDict[element[0]] = e.combinations
                        logMess(
                            "DEBUG:ATO003",
                            "We don't know how {0} binds together in complex {1}. Not atomizing".format(
                                e.value, element[0]
                            ),
                        )

                        # there awas an issue during binding, don't atomize
                        translator[element[0]] = createEmptySpecies(element[0])

        # evaluate species that weren't bound properly and see if we can get information from all over the model to find the right binding partner
        bindingTroubleLog = defaultdict(list)
        modifiedPairs = set()

        redrawflag = False
        for molecule in bindingFailureDict:
            bindingWinner = defaultdict(list)
            for candidateTuple in bindingFailureDict[molecule]:
                bindingWinner[bindingCounter[candidateTuple]].append(candidateTuple)
            bestBindingCandidates = bindingWinner[max(bindingWinner.keys())]
            if len(bestBindingCandidates) > 1:
                bindingTroubleLog[tuple(sorted(bestBindingCandidates))].append(molecule)
            else:
                bindingPair = bestBindingCandidates[0]
                if bindingPair not in modifiedPairs:
                    modifiedPairs.add(bindingPair)
                else:
                    continue
                c1 = st.Component(bindingPair[1].lower())
                c2 = st.Component(bindingPair[0].lower())
                molecule1 = translator[
                    translator[bindingPair[0]].molecules[0].name
                ].molecules[0]
                molecule2 = translator[
                    translator[bindingPair[1]].molecules[0].name
                ].molecules[0]

                molecule1.addComponent(c1)
                molecule2.addComponent(c2)
                redrawflag = True
                logMess(
                    "INFO:ATO031",
                    "Determining that {0} binds together based on frequency of the bond in the reaction network.".format(
                        bindingPair
                    ),
                )
    for trouble in bindingTroubleLog:
        logMess(
            "ERROR:ATO202",
            "{0}:{1}:We need information to resolve the bond structure of these complexes . \
Please choose among the possible binding candidates that had the most observed frequency in the reaction network or provide a new one".format(
                bindingTroubleLog[trouble], trouble
            ),
        )
    # renaming of components as needed for readability.
    for spname in translator:
        species = translator[spname]
        orig_species_st2 = species.str2()
        for molec in species.molecules:
            for component in molec.components:
                if component.name.startswith("_"):
                    # component starts with "_", adjust
                    component.name = re.sub("^(_)+", "", component.name)
                if component.name.endswith("_"):
                    # component starts with "_", adjust
                    component.name = re.sub("(_)+$", "", component.name)
                numbered_states = False
                for ist, state in enumerate(component.states):
                    orig_state = state
                    if state.startswith("_"):
                        component.states[ist] = re.sub("^(_)+", "", state)
                    if state.endswith("_"):
                        component.states[ist] = re.sub("(_)+$", "", state)
                    if (component.states[ist].lower() in component.name.lower()) and (
                        len(component.states[ist]) > 5
                    ):
                        # the component state is named after component, rename
                        numbered_states = True
                    # ensure active state is updated
                    if orig_state == component.activeState:
                        component.activeState = component.states[ist]
                if numbered_states:
                    ctr = 1
                    for ist, state in enumerate(component.states):
                        orig_state = state
                        if state == "0":
                            continue
                        else:
                            component.states[ist] = str(ctr)
                            ctr += 1
                        # ensure active state is updated
                        if orig_state == component.activeState:
                            component.activeState = component.states[ist]
        # log the info
        new_species_str2 = species.str2()
        if new_species_str2 != orig_species_st2:
            logMess(
                "INFO:ATO032",
                f"Renamed species from {orig_species_st2} to {new_species_str2}",
            )
    # renaming can introduce name clashes of components, we need to renumber them
    # if necessary. First pass for counting
    for spname in translator:
        species = translator[spname]
        orig_species_st2 = species.str2()
        for molec in species.molecules:
            comp_counter = {}
            for component in molec.components:
                # add to counter if we don't have it
                if component.name not in comp_counter:
                    comp_counter[component.name] = -1
                # make sure we have the right count
                else:
                    if comp_counter[component.name] < 0:
                        comp_counter[component.name] = 2
                    else:
                        comp_counter[component.name] += 1
            # remove unnecessary stuff
            to_remove = []
            for comp in comp_counter:
                if comp_counter[comp] < 0:
                    to_remove.append(comp)
            for tr in to_remove:
                comp_counter.pop(tr)
            # rename the components
            comp_counter_2 = {}
            for component in molec.components:
                # check if we have multiple of this component
                if component.name not in comp_counter:
                    continue
                # we do have multiples, check where we are at
                if component.name not in comp_counter_2:
                    # our first encounter
                    comp_counter_2[component.name] = 1
                    component.name += "1"
                else:
                    comp_counter_2[component.name] += 1
                    component.name += f"{comp_counter_2[component.name]}"
        for spname in translator:
            species = translator[spname]
            for molec in species.molecules:
                for comp in molec.components:
                    comp.states = sorted(comp.states)
        # report
        new_species_str2 = species.str2()
        if new_species_str2 != orig_species_st2:
            logMess(
                "INFO:ATO033",
                f"Renumbered components from {orig_species_st2} to {new_species_str2}",
            )


[docs]def updateSpecies(species, referenceMolecule):
    flag = False
    for moleculeStructure in species.molecules:
        if moleculeStructure.name == referenceMolecule.name:
            for component in referenceMolecule.components:
                count = [x.name for x in referenceMolecule.components].count(
                    component.name
                )
                count -= [x.name for x in moleculeStructure.components].count(
                    component.name
                )
                newComponent = st.Component(component.name)
                # if len(component.states) > 0:
                #    newComponent.addState('0')
                if count > 0:
                    for _ in range(0, count):
                        # just make a copy of the reference component and set active state to 0
                        componentCopy = deepcopy(component)
                        componentCopy.setActiveState("0")
                        moleculeStructure.addComponent(componentCopy)
                elif count < 0:
                    for _ in range(0, -count):
                        # FIXME: does not fully copy the states
                        referenceMolecule.addComponent(deepcopy(newComponent))
                        flag = True
                elif count == 0:
                    localComponents = [
                        x
                        for x in moleculeStructure.components
                        if x.name == component.name
                    ]
                    referenceComponents = [
                        x
                        for x in referenceMolecule.components
                        if x.name == component.name
                    ]
                    if [x.states for x in localComponents] != [
                        x.states for x in referenceComponents
                    ]:
                        for lc in localComponents:
                            for rc in referenceComponents:
                                for ls in lc.states:
                                    if ls not in rc.states:
                                        rc.addState(ls, update=False)
                                for rs in rc.states:
                                    if rs not in lc.states:
                                        lc.addState(rs, update=False)

            for component in moleculeStructure.components:
                count = [x.name for x in referenceMolecule.components].count(
                    component.name
                )
                count -= [x.name for x in moleculeStructure.components].count(
                    component.name
                )
                newComponent = st.Component(component.name)
                if len(component.states) > 0:
                    newComponent.addState(component.states[0])
                    newComponent.addState("0")
                if count > 0:
                    for idx in range(0, count):
                        moleculeStructure.addComponent(deepcopy(newComponent))
                elif count < 0:
                    for idx in range(0, -count):
                        referenceMolecule.addComponent(deepcopy(newComponent))
                        flag = True

    return flag


[docs]def propagateChanges(translator, dependencyGraph):
    flag = True
    while flag:
        flag = False
        for dependency in dependencyGraph:
            if dependencyGraph[dependency] == []:
                continue
            for molecule in dependencyGraph[dependency][0]:
                try:
                    if updateSpecies(
                        translator[dependency],
                        translator[getTrueTag(dependencyGraph, molecule)].molecules[0],
                    ):
                        flag = True
                except:
                    logMess(
                        "CRITICAL:Program", "Species is not being properly propagated"
                    )
                    flag = False


[docs]def sanityCheck(database):
    """
    checks for critical atomization errors like isomorphism
    """
    stringrep = {x: str(database.translator[x]) for x in database.translator}
    repeats = set()
    for key in range(0, len(list(database.translator.keys())) - 1):
        for key2 in range(key + 1, len(list(database.translator.keys()))):
            if (
                stringrep[list(database.translator.keys())[key]]
                == stringrep[list(database.translator.keys())[key2]]
            ):
                repeats.add(
                    (
                        list(database.translator.keys())[key],
                        list(database.translator.keys())[key2],
                    )
                )
    removed = []
    for repeat in repeats:
        temp = sorted(repeat)
        logMess(
            "ERROR:SCT241",
            "{0}:{1}:produce the same translation:{2}:{1}:was emptied".format(
                temp[0], temp[1], database.prunnedDependencyGraph[temp[0]][0]
            ),
        )
        if temp[1] in list(database.translator.keys()) and repeat[1] not in removed:
            database.translator.pop(repeat[1])
            removed.append(repeat[1])


[docs]def transformMolecules(
    parser,
    database,
    configurationFile,
    namingConventions,
    speciesEquivalences=None,
    bioGridFlag=False,
    memoizedResolver=True,
):
    """
    main method. Receives a parser configuration, a configurationFile and a
    list of user defined species equivalences and returns a dictionary
    containing an atomized version of the model
    Args:
        parser: data structure containing the reactions and species we will use
        database: data structure containining the result of the outgoing translation
        configurationFile:
        speciesEquivalences: predefined species
    """
    """
    import cProfile, pstats, StringIO
    pr = cProfile.Profile()
    pr.enable()
    """
    database.parser = parser
    # ASS - Gotta pass in the option to memoize here
    sctsolver = resolveSCT.SCTSolver(database, memoizedResolver)
    database = sctsolver.createSpeciesCompositionGraph(
        parser,
        configurationFile,
        namingConventions,
        speciesEquivalences=speciesEquivalences,
        bioGridFlag=bioGridFlag,
    )

    for element in database.artificialEquivalenceTranslator:
        if element not in database.eequivalenceTranslator:
            database.eequivalenceTranslator[element] = []
        database.eequivalenceTranslator[element].extend(
            database.artificialEquivalenceTranslator[element]
        )

    # special handling for double modifications like double phosporylation
    # FIXME: this needs to be done in a cleaner way(e.g. getting them
    # from a file instead of being hardcoded)
    doubleModifications = {"Double-Phosporylation": "Phosporylation"}

    for element in doubleModifications:
        if doubleModifications[element] not in database.eequivalenceTranslator:
            continue
        if element not in database.eequivalenceTranslator:
            database.eequivalenceTranslator[element] = []

        baseElements = [
            x[0] for x in database.eequivalenceTranslator[doubleModifications[element]]
        ]
        modifiedElements = [
            x[1] for x in database.eequivalenceTranslator[doubleModifications[element]]
        ]
        # deleteEquivalences = [baseElements.index(x) for x in baseElements if x in modifiedElements]

        deleteEquivalences = [
            (x, modifiedElements[baseElements.index(x)])
            for x in baseElements
            if x in modifiedElements
        ]

        for eq in deleteEquivalences:
            if eq not in database.eequivalenceTranslator[element]:
                database.eequivalenceTranslator[element].append(eq)

        for eq in deleteEquivalences:
            if eq in database.eequivalenceTranslator[doubleModifications[element]]:
                database.eequivalenceTranslator[doubleModifications[element]].remove(eq)

    for modification in database.tmpEquivalence:
        for candidates in database.tmpEquivalence[modification]:
            for instance in candidates:
                atoAux.addToDependencyGraph(
                    database.eequivalenceTranslator, modification, instance
                )

    database.weights = sorted(
        database.weights, key=lambda rule: (rule[1], len(rule[0]))
    )
    atomize(
        database.prunnedDependencyGraph,
        database.weights,
        database.translator,
        database.reactionProperties,
        database.eequivalenceTranslator2,
        bioGridFlag,
        database.sbmlAnalyzer,
        database,
        parser,
    )
    onlySynDec = (
        len([x for x in database.classifications if x not in ["Generation", "Decay"]])
        == 0
    )
    propagateChanges(database.translator, database.prunnedDependencyGraph)

    # check for isomorphism
    sanityCheck(database)
    """
    pr.disable()
    s = StringIO.StringIO()
    sortby = 'cumulative'
    ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
    ps.print_stats(10)
    print s.getvalue()
    """
    # database.assumptions = deepcopy(assumptions)
    # assumptions.clear()
    # ASS: Adding atomized new molecules to the molecule list
    for molecule in database.translator.keys():
        if molecule not in database.parser.used_molecules:
            database.parser.used_molecules.append(molecule)

    return database.translator, onlySynDec