# -*- coding: utf-8 -*-
"""
Created on Thu Mar 22 13:11:38 2012
@author: proto
"""
import enum
import imp
from pyparsing import Word, Suppress, Optional, alphanums, Group, ZeroOrMore
import numpy as np
import json
import itertools
import bionetgen.atomizer.utils.structures as st
from copy import deepcopy, copy
from . import detectOntology
import re
import difflib
from bionetgen.atomizer.utils.util import logMess
from collections import defaultdict
import math
from collections import Counter
import os
from bionetgen.atomizer.utils.util import pmemoize as memoize
"""
This module classifies reaction rules according to the information contained in
the JSON config file, based on their reactants and products.
"""
@memoize
def get_close_matches(match, dataset, cutoff=0.6):
return difflib.get_close_matches(match, dataset, cutoff=cutoff)
@memoize
def sequenceMatcher(a, b):
"""
compares two strings ignoring underscores
"""
return difflib.SequenceMatcher(lambda x: x == "_", a, b).ratio()
name = Word(alphanums + "_-") + ":"
species = (
Word(alphanums + "_" + ":#-")
+ Suppress("()")
+ Optional(Suppress("@" + Word(alphanums + "_-")))
) + ZeroOrMore(
Suppress("+")
+ Word(alphanums + "_" + ":#-")
+ Suppress("()")
+ Optional(Suppress("@" + Word(alphanums + "_-")))
)
rate = Word("-" + alphanums + "()")
grammar = Suppress(Optional(name)) + (
(Group(species) | "0")
+ Suppress(Optional("<") + "->")
+ (Group(species) | "0")
+ Suppress(rate)
)
@memoize
def parseReactions(reaction, specialSymbols=""):
if reaction.startswith("#"):
return None
result = grammar.parseString(reaction).asList()
if len(result) < 2:
result = [result, []]
if "<->" in reaction and len(result[0]) == 1 and len(result[1]) == 2:
result.reverse()
return result
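# Illustrative sketch (not from the original source): given the pyparsing grammar
# above, a reaction string such as "EGF() + EGFR() -> EGF_EGFR() k1" would be
# expected to parse into a reactant/product pair of token lists, e.g.
#   parseReactions("EGF() + EGFR() -> EGF_EGFR() k1")
#   -> [['EGF', 'EGFR'], ['EGF_EGFR']]
# An optional "name:" prefix and "@compartment" tags are suppressed by the grammar,
# as is the trailing rate expression.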
def addToDependencyGraph(dependencyGraph, label, value):
if label not in dependencyGraph:
dependencyGraph[label] = []
if value not in dependencyGraph[label] and value != []:
dependencyGraph[label].append(value)
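# Hedged usage example (hypothetical names):
#   dg = {}
#   addToDependencyGraph(dg, "EGF_EGFR", ["EGF", "EGFR"])
#   # dg is now {"EGF_EGFR": [["EGF", "EGFR"]]}; repeating the same call leaves dg
#   # unchanged, since duplicate and empty values are not appended.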
class SBMLAnalyzer:
def __init__(
self,
modelParser,
configurationFile,
namingConventions,
speciesEquivalences=None,
conservationOfMass=True,
):
self.modelParser = modelParser
self.configurationFile = configurationFile
self.namingConventions = detectOntology.loadOntology(namingConventions)
self.userNamingConventions = copy(self.namingConventions)
self.speciesEquivalences = speciesEquivalences
self.userEquivalencesDict = None
self.lexicalSpecies = []
self.conservationOfMass = conservationOfMass
def distanceToModification(self, particle, modifiedElement, translationKeys):
posparticlePos = [
m.start() + len(particle) for m in re.finditer(particle, modifiedElement)
]
preparticlePos = [m.start() for m in re.finditer(particle, modifiedElement)]
keyPos = [m.start() for m in re.finditer(translationKeys, modifiedElement)]
distance = [abs(y - x) for x in posparticlePos for y in keyPos]
distance.extend([abs(y - x) for x in preparticlePos for y in keyPos])
distance.append(9999)
return min(distance)
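# Hedged example of the distance metric above (hypothetical inputs): for the
# modified species "egfr_p", the particle "egfr" and the translation key "_p",
#   self.distanceToModification("egfr", "egfr_p", "_p") -> 0
# because the key starts exactly where the particle ends. Matching is
# case-sensitive, and 9999 is returned when either pattern is not found.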
def fuzzyArtificialReaction(self, baseElements, modifiedElement, molecules):
"""
in case we don't know how a species is composed but we know its base
elements, try to get it by concatenating its basic reactants
"""
import collections
compare = lambda x, y: collections.Counter(x) == collections.Counter(y)
(
equivalenceTranslator,
translationKeys,
conventionDict,
) = self.processNamingConventions2(molecules)
indirectEquivalenceTranslator = {x: [] for x in equivalenceTranslator}
self.processFuzzyReaction(
[baseElements, modifiedElement],
translationKeys,
conventionDict,
indirectEquivalenceTranslator,
)
newBaseElements = baseElements
for modification in indirectEquivalenceTranslator:
for element in indirectEquivalenceTranslator[modification]:
newBaseElements = [
element[2][1] if x == element[2][0] else x for x in newBaseElements
]
if compare(baseElements, newBaseElements):
return None
return newBaseElements
def analyzeSpeciesModification2(
self, baseElement, modifiedElement, partialAnalysis
):
"""
A method to read modifications within complexes.
"""
def index_min(values):
return min(range(len(values)), key=values.__getitem__)
(
equivalenceTranslator,
translationKeys,
conventionDict,
) = self.processNamingConventions2([baseElement, modifiedElement])
differencePosition = [
(i, x)
for i, x in enumerate(difflib.ndiff(baseElement, modifiedElement))
if x.startswith("+")
]
tmp = ""
lastIdx = 0
newDifferencePosition = []
for i in range(len(differencePosition)):
tmp += differencePosition[i][1][-1]
if tmp in translationKeys:
newDifferencePosition.append(
(
(differencePosition[lastIdx][0] + differencePosition[i][0]) / 2,
tmp,
)
)
tmp = ""
lastIdx = i
differencePosition = newDifferencePosition
if len(differencePosition) == 0:
return None, None, None
sortedPartialAnalysis = sorted(partialAnalysis, key=len, reverse=True)
tokenPosition = []
tmpModifiedElement = modifiedElement
for token in sortedPartialAnalysis:
sequenceMatcher = difflib.SequenceMatcher(None, token, tmpModifiedElement)
# sequenceMatcher2 = difflib.SequenceMatcher(None,token,baseElement)
modifiedMatchingBlocks = [
m.span() for m in re.finditer(token, tmpModifiedElement)
]
baseMatchingBlocks = [m.span() for m in re.finditer(token, baseElement)]
# matchingBlocks = [x for x in modifiedMatchingBlocks for y in baseMatching Blocks if ]
if len(modifiedMatchingBlocks) > 0 and len(baseMatchingBlocks) > 0:
# select the matching block with the lowest distance to the base matching block
matchingBlockIdx = index_min(
[
min(
[
abs((y[1] + y[0]) / 2 - (x[1] + x[0]) / 2)
for y in baseMatchingBlocks
]
)
for x in modifiedMatchingBlocks
]
)
matchingBlock = modifiedMatchingBlocks[matchingBlockIdx]
tmpModifiedElement = list(tmpModifiedElement)
for idx in range(matchingBlock[0], matchingBlock[1]):
tmpModifiedElement[idx] = "_"
tmpModifiedElement = "".join(tmpModifiedElement)
tokenPosition.append((matchingBlock[0], matchingBlock[1] - 1))
else:
# try fuzzy search
sequenceMatcher = difflib.SequenceMatcher(
None, token, tmpModifiedElement
)
match = "".join(
tmpModifiedElement[j : j + n]
for i, j, n in sequenceMatcher.get_matching_blocks()
if n
)
if (len(match)) / float(len(token)) < 0.8:
tokenPosition.append([999999999])
else:
tmp = [
i
for i, y in enumerate(difflib.ndiff(token, tmpModifiedElement))
if not y.startswith("+")
]
if tmp[-1] - tmp[0] > len(token) + 5:
tokenPosition.append([999999999])
continue
tmpModifiedElement = list(tmpModifiedElement)
for idx in tmp:
if idx < len(tmpModifiedElement):
tmpModifiedElement[idx] = "_"
tmpModifiedElement = "".join(tmpModifiedElement)
tmp = [tmp[0], tmp[-1] - 1]
tokenPosition.append(tmp)
intersection = []
for difference in differencePosition:
distance = []
for token in tokenPosition:
distance.append(
min([abs(difference[0] - subtoken) for subtoken in token])
)
closestToken = sortedPartialAnalysis[index_min(distance)]
# if difference[1] in conventionDict:
intersection.append([difference[1], closestToken, min(distance)])
minimumToken = min(intersection, key=lambda x: x[2])
if intersection:
return minimumToken[1], translationKeys, equivalenceTranslator
return None, None, None
def analyzeSpeciesModification(self, baseElement, modifiedElement, partialAnalysis):
"""
A method for trying to read modifications within complexes.
This is only possible once we know their internal structure
(this method is called after the dependency graph has been
created and resolved).
"""
(
equivalenceTranslator,
translationKeys,
conventionDict,
) = self.processNamingConventions2([baseElement, modifiedElement])
scores = []
if len(translationKeys) == 0:
"""
there's no clear lexical path between reactant and product
"""
return None, None, None
for particle in partialAnalysis:
distance = 9999
comparisonElement = max(baseElement, modifiedElement, key=len)
if re.search("(_|^){0}(_|$)".format(particle), comparisonElement) == None:
distance = self.distanceToModification(
particle, comparisonElement, translationKeys[0]
)
score = difflib.ndiff(particle, modifiedElement)
else:
# FIXME: make sure we only do a search on those variables that are viable
# candidates. this is once again fuzzy string matching. there should
# be a better way of doing this with difflib
permutations = set(
[
"_".join(x)
for x in itertools.permutations(partialAnalysis, 2)
if x[0] == particle
]
)
if all([x not in modifiedElement for x in permutations]):
distance = self.distanceToModification(
particle, comparisonElement, translationKeys[0]
)
score = difflib.ndiff(particle, modifiedElement)
# FIXME: this is just an ad-hoc parameter for how far a modification can be from a species name
# use something better
if distance < 4:
scores.append([particle, distance])
if len(scores) > 0:
winner = scores[[x[1] for x in scores].index(min([x[1] for x in scores]))][
0
]
else:
winner = None
if winner:
return winner, translationKeys, equivalenceTranslator
return None, None, None
def findMatchingModification(self, particle, species):
@memoize
def findMatchingModificationHelper(particle, species):
difference = difflib.ndiff(species, particle)
differenceList = tuple([x for x in difference if "+" in x])
if differenceList in self.namingConventions["patterns"]:
return [self.namingConventions["patterns"][differenceList]]
fuzzyKey = "".join([x[2:] for x in differenceList])
differenceList = self.testAgainstExistingConventions(
fuzzyKey, self.namingConventions["modificationList"]
)
# can we state the modification as the combination of multiple modifications
if differenceList:
classificationList = []
for x in differenceList[0]:
differenceKey = tuple(["+ {0}".format(letter) for letter in x])
classificationList.append(
self.namingConventions["patterns"][differenceKey]
)
return classificationList
return None
return findMatchingModificationHelper(particle, species)
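# Hedged sketch of the lookup above: assuming the ontology's "patterns" table
# contains the key ('+ _', '+ p') (i.e. the character diff produced by appending
# "_p" to a species name), then
#   self.findMatchingModification("EGFR_p", "EGFR")
# would return a one-element list with whatever modification name the ontology
# maps that pattern to; otherwise the fuzzy fallback through
# testAgainstExistingConventions is tried before giving up and returning None.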
def greedyModificationMatching(self, speciesString, referenceSpecies):
"""
recursive function trying to map a given species string to a string permutation of the strings in reference species
>>> sa = SBMLAnalyzer(None,'./config/reactionDefinitions.json','./config/namingConventions.json')
>>> sorted(sa.greedyModificationMatching('EGF_EGFR',['EGF','EGFR']))
['EGF', 'EGFR']
>>> sorted(sa.greedyModificationMatching('EGF_EGFR_2_P_Grb2',['EGF','EGFR','EGF_EGFR_2_P','Grb2']))
['EGF_EGFR_2_P', 'Grb2']
>>> sorted(sa.greedyModificationMatching('A_B_C_D',['A','B','C','C_D','A_B_C','A_B']))
['A_B', 'C_D']
"""
bestMatch = ["", 0]
finalMatches = []
blacklist = []
while len(blacklist) < len(referenceSpecies):
localReferenceSpecies = [
x
for x in referenceSpecies
if x not in blacklist and len(x) <= len(speciesString)
]
for species in localReferenceSpecies:
if (
species in speciesString
and len(species) > bestMatch[1]
and species != speciesString
):
bestMatch = [species, len(species)]
if bestMatch != ["", 0]:
result = self.greedyModificationMatching(
speciesString.replace(bestMatch[0], ""), referenceSpecies
)
finalMatches = [bestMatch[0]]
if result == -1:
finalMatches = []
blacklist.append(bestMatch[0])
bestMatch = ["", 0]
continue
elif result != -2:
finalMatches.extend(result)
break
elif len([x for x in speciesString if x != "_"]) > 0:
return -1
else:
return -2
return finalMatches
def findClosestModification(
self, particles, species, annotationDict, originalDependencyGraph
):
"""
maps a set of particles to the complete set of species using lexical analysis. This step is done
independently of the reaction network.
"""
# ASS2019 - in Python3 species is a dictKey object and can't be marshaled,
# this should be both p2 and p3 compatible
species = list(species)
equivalenceTranslator = {}
dependencyGraph = {}
localSpeciesDict = defaultdict(lambda: defaultdict(list))
def analyzeByParticle(
splitparticle,
species,
equivalenceTranslator=equivalenceTranslator,
dependencyGraph=dependencyGraph,
):
basicElements = []
composingElements = []
splitpindex = -1
while (splitpindex + 1) < len(splitparticle):
splitpindex += 1
splitp = splitparticle[splitpindex]
if splitp in species:
closestList = [splitp]
similarList = get_close_matches(splitp, species)
similarList = [
x for x in similarList if x != splitp and len(x) < len(splitp)
]
similarList = [[x, splitp] for x in similarList]
if len(similarList) > 0:
for similarity in similarList:
# compare close lexical proximity
fuzzyList = self.processAdHocNamingConventions(
similarity[0],
similarity[1],
localSpeciesDict,
False,
species,
)
for reaction, tag, modifier in fuzzyList:
if modifier != None and all(
["-" not in x for x in modifier]
):
logMess(
"INFO:LAE001",
"Lexical relationship inferred between \
{0}, user information confirming it is required".format(
similarity
),
)
else:
closestList = get_close_matches(splitp, species)
closestList = [x for x in closestList if len(x) < len(splitp)]
# if there's nothing in the species list I can find a lexical
# neighbor from, then try to create one based on my two
# positional neighbors
if closestList == []:
flag = True
# do i get something by merging with the previous component?
if len(composingElements) > 0:
tmp, tmp2 = analyzeByParticle(
[composingElements[-1] + "_" + splitp], species
)
if tmp != [] and tmp2 != []:
flag = False
splitp = composingElements[-1] + "_" + splitp
composingElements.pop()
closestList = tmp
(
localEquivalenceTranslator,
_,
_,
) = self.processNamingConventions2([tmp[0], tmp2[0]])
for element in localEquivalenceTranslator:
if element not in equivalenceTranslator:
equivalenceTranslator[element] = []
equivalenceTranslator[element].extend(
localEquivalenceTranslator[element]
)
for instance in localEquivalenceTranslator[element]:
addToDependencyGraph(
dependencyGraph, instance[1], [instance[0]]
)
# do i get something by merging with the next component?
if flag and splitpindex + 1 != len(splitparticle):
tmp, tmp2 = analyzeByParticle(
[splitp + "_" + splitparticle[splitpindex + 1]], species
)
if tmp != [] and tmp2 != []:
splitp = splitp + "_" + splitparticle[splitpindex + 1]
splitpindex += 1
closestList = tmp
(
localEquivalenceTranslator,
_,
_,
) = self.processNamingConventions2([tmp[0], tmp2[0]])
for element in localEquivalenceTranslator:
if element not in equivalenceTranslator:
equivalenceTranslator[element] = []
equivalenceTranslator[element].append(
localEquivalenceTranslator[element]
)
for instance in localEquivalenceTranslator[element]:
addToDependencyGraph(
dependencyGraph, instance[1], [instance[0]]
)
else:
return [], []
elif flag:
return [], []
basicElements.append(min(closestList, key=len))
# if what i have is a known compound just add it
if splitp in species:
composingElements.append(splitp)
# if not create it
else:
closestList = get_close_matches(splitp, species)
closestList = [x for x in closestList if len(x) < len(splitp)]
flag = False
for element in closestList:
(
localEquivalenceTranslator,
_,
_,
) = self.processNamingConventions2([element, splitp])
if len(list(localEquivalenceTranslator.keys())) == 0:
basicElements = []
composingElements = []
for element in localEquivalenceTranslator:
if element not in equivalenceTranslator:
equivalenceTranslator[element] = []
equivalenceTranslator[element].append(
localEquivalenceTranslator[element]
)
for instance in localEquivalenceTranslator[element]:
addToDependencyGraph(
dependencyGraph, instance[1], [instance[0]]
)
flag = True
if flag:
composingElements.append(splitp)
return basicElements, composingElements
additionalHandling = []
# lexical handling
for particle in sorted(particles, key=len):
composingElements = []
basicElements = []
# can you break it down into small bites?
if "_" in particle:
splitparticle = particle.split("_")
# print('---',splitparticle)
splitparticle = [x for x in splitparticle if x]
# print(splitparticle)
basicElements, composingElements = analyzeByParticle(
splitparticle, species
)
if basicElements == composingElements and basicElements:
closeMatches = get_close_matches(particle, species)
matches = [
x
for x in closeMatches
if len(x) < len(particle) and len(x) >= 3
]
for match in matches:
difference = difflib.ndiff(match, particle)
differenceList = tuple([x for x in difference if "+" in x])
if differenceList in self.namingConventions["patterns"]:
logMess(
"INFO:LAE005",
"matching {0}={1}".format(particle, [match]),
)
addToDependencyGraph(dependencyGraph, particle, [match])
if len(matches) > 0:
continue
elif (
particle not in composingElements
and composingElements != []
and all([x in species for x in composingElements])
):
addToDependencyGraph(dependencyGraph, particle, composingElements)
for element in composingElements:
if element not in dependencyGraph:
addToDependencyGraph(dependencyGraph, element, [])
if element not in particles:
additionalHandling.append(element)
continue
else:
for basicElement in basicElements:
if basicElement in particle and basicElement != particle:
fuzzyList = self.processAdHocNamingConventions(
basicElement, particle, localSpeciesDict, False, species
)
if self.testAgainstExistingConventions(
fuzzyList[0][1],
self.namingConventions["modificationList"],
):
addToDependencyGraph(
dependencyGraph, particle, [basicElement]
)
logMess(
"INFO:LAE005",
"{0} can be mapped to {1} through existing naming conventions".format(
particle, [basicElement]
),
)
break
continue
# if bottom up doesn't work try a top down approach
for comparisonParticle in particles:
if particle == comparisonParticle:
continue
# try to map remaining orphaned molecules to each other based on simple, but known modifications
if comparisonParticle in particle:
fuzzyList = self.processAdHocNamingConventions(
particle, comparisonParticle, localSpeciesDict, False, species
)
if self.testAgainstExistingConventions(
fuzzyList[0][1], self.namingConventions["modificationList"]
):
if (
particle in annotationDict
and comparisonParticle in annotationDict
):
baseSet = set(
[
y
for x in annotationDict[particle]
for y in annotationDict[particle][x]
]
)
modSet = set(
[
y
for x in annotationDict[comparisonParticle]
for y in annotationDict[comparisonParticle][x]
]
)
if len(baseSet.intersection(modSet)) == 0:
baseDB = set(
[
x.split("/")[-2]
for x in baseSet
if "identifiers.org" in x
]
)
modDB = set(
[
x.split("/")[-2]
for x in modSet
if "identifiers.org" in x
]
)
# we still have to check that they both reference the same database
if len(baseDB.intersection(modDB)) > 0:
logMess(
"ERROR:ANN202",
"{0}:{1}:can be mapped through naming conventions but the annotation information does not match".format(
particle, comparisonParticle
),
)
continue
addToDependencyGraph(
dependencyGraph, particle, [comparisonParticle]
)
logMess(
"INFO:LAE005",
"{0} can be mapped to {1} through existing naming conventions".format(
particle, [comparisonParticle]
),
)
break
else:
common_root = detectOntology.findLongestSubstring(
particle, comparisonParticle
)
# some arbitrary threshold of what makes a good minimum length for the common root
if (
len(common_root) > 0
and common_root not in originalDependencyGraph
):
fuzzyList = self.processAdHocNamingConventions(
common_root,
comparisonParticle,
localSpeciesDict,
False,
species,
)
fuzzyList2 = self.processAdHocNamingConventions(
common_root, particle, localSpeciesDict, False, species
)
particleMap = self.testAgainstExistingConventions(
fuzzyList[0][1], self.namingConventions["modificationList"]
)
compParticleMap = (
fuzzyList2,
self.testAgainstExistingConventions(
fuzzyList2[0][1],
self.namingConventions["modificationList"],
),
)
if particleMap and compParticleMap:
if (
particle in annotationDict
and comparisonParticle in annotationDict
):
baseSet = set(
[
y
for x in annotationDict[particle]
for y in annotationDict[particle][x]
]
)
modSet = set(
[
y
for x in annotationDict[comparisonParticle]
for y in annotationDict[comparisonParticle][x]
]
)
if len(baseSet.intersection(modSet)) == 0:
logMess(
"ERROR:ANN202",
"{0}:{1}:can be mapped through naming conventions but the annotation information does not match".format(
particle, comparisonParticle
),
)
break
addToDependencyGraph(
dependencyGraph, particle, [common_root]
)
addToDependencyGraph(
dependencyGraph, comparisonParticle, [common_root]
)
addToDependencyGraph(dependencyGraph, common_root, [])
logMess(
"INFO:LAE006",
"{0}:{1}:can be mapped together through new common molecule {2} by existing naming conventions".format(
particle, comparisonParticle, common_root
),
)
break
# if len(additionalHandling) > 0:
# print(self.findClosestModification(set(additionalHandling),species))
return dependencyGraph, equivalenceTranslator
def loadConfigFiles(self, fileName):
"""
the reactionDefinition file must contain the definitions of the basic reaction types
we want to parse and the requirements for a given reaction type to be considered
as such
"""
reactionDefinition = ""
if fileName is not None:
if fileName == "":
return []
if os.path.isfile(fileName):
with open(fileName, "r") as fp:
reactionDefinition_new = json.load(fp)
# adjust for new file format and keep propagating the same
# object format downstream so we don't have to deal with
# downstream changes
# let's start with binding interaction/complexDefinition block
reactionDefinition = {}
if "binding_interactions" in reactionDefinition_new:
reactionDefinition["complexDefinition"] = []
# convert new JSON format to old data format
# we'll need to make sure symmetrical ones are created
# this is a list of pairs
for binding_pair in reactionDefinition_new["binding_interactions"]:
first, second = binding_pair[0], binding_pair[1]
if isinstance(first, dict) or isinstance(second, dict):
# TODO: Implement dictionaries for binding partners in the future
raise NotImplementedError(
"Dictionaries for binding pairs are not implemented yet"
)
# let's deal with first
# first initialize the item or pull it if it already exists
item = None
for ix, x in enumerate(reactionDefinition["complexDefinition"]):
if x[0] == first:
item = reactionDefinition["complexDefinition"].pop(ix)
break
if item is None:
item = [first, [[first]]]
item[1][0].append(second.lower())
item[1][0].append([])
reactionDefinition["complexDefinition"].append(item)
if first != second:
# now deal with second partner
# first initialize the item or pull it if it already exists
item = None
for ix, x in enumerate(
reactionDefinition["complexDefinition"]
):
if x[0] == second:
item = reactionDefinition["complexDefinition"].pop(
ix
)
break
if item is None:
item = [second, [[second]]]
item[1][0].append(first.lower())
item[1][0].append([])
reactionDefinition["complexDefinition"].append(item)
else:
reactionDefinition["complexDefinition"] = []
# now deal with reaction definition block
if "reactionDefinition" in reactionDefinition_new:
reactionDefinition["reactionDefinition"] = []
# convert new JSON format to old data format
else:
reactionDefinition["reactionDefinition"] = []
# deal with modifications
if "modificationDefinition" in reactionDefinition_new:
# TODO: Change file format to be nicer?
reactionDefinition[
"modificationDefinition"
] = reactionDefinition_new["modificationDefinition"]
# convert new JSON format to old data format
else:
reactionDefinition["modificationDefinition"] = {}
return reactionDefinition
reactionDefinition = {
"reactions": [
[["S0", "S1"], ["S2"]],
[["S2"], ["S0", "S1"]],
[[], ["S0"]],
[["S0"], []],
[["S0", "S1", "S2"], ["S3"]],
[["S3"], ["S0", "S1", "S2"]],
],
"reactionsNames": [
"Binding",
"Binding",
"Binding",
"Binding",
"Generation",
"Decay",
"Phosporylation",
"Double-Phosporylation",
"iMod",
"mMod",
"Ubiquitination",
],
"definitions": [
[{"r": [0]}, {"n": []}],
[{"r": [1]}, {"n": []}],
[{"r": [4]}, {"n": []}],
[{"r": [5]}, {"n": []}],
[{"r": [2]}],
[{"r": [3]}],
[{"n": [0]}],
[{"n": []}],
[{"n": []}],
[{"n": []}],
[{"n": []}],
],
}
return reactionDefinition
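# Hedged sketch of the "binding_interactions" conversion implemented above
# (hypothetical input, not taken from the shipped config files): a new-format file
#   {"binding_interactions": [["EGF", "EGFR"]]}
# would be expected to be translated into the old-format structure
#   {"complexDefinition": [["EGF", [["EGF", "egfr", []]]],
#                          ["EGFR", [["EGFR", "egf", []]]]],
#    "reactionDefinition": [],
#    "modificationDefinition": {}}
# while a fileName of None falls back to the hard-coded default dictionary above.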
def identifyReactions2(self, rule, reactionDefinition):
"""
This method goes through the list of common reactions listed in ruleDictionary
and tries to find how they are related according to the information in reactionDefinition
"""
result = []
for idx, element in enumerate(reactionDefinition["reactions"]):
tmp1 = rule[0] if rule[0] not in ["0", ["0"]] else []
tmp2 = rule[1] if rule[1] not in ["0", ["0"]] else []
if len(tmp1) == len(element[0]) and len(tmp2) == len(element[1]):
result.append(1)
# for (el1,el2) in (element[0],rule[0]):
# if element[0].count(el1) == element[]
else:
result.append(0)
return result
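# Hedged example: with the default "reactions" list from loadConfigFiles
# (2->1, 1->2, 0->1, 1->0, 3->1 and 1->3 stoichiometries), a rule such as
# [['A', 'B'], ['C']] only matches the first template by length, so
#   self.identifyReactions2([['A', 'B'], ['C']], reactionDefinition)
# would be expected to return [1, 0, 0, 0, 0, 0].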
def species2Rules(self, rules):
"""
This method goes through the rule list and classifies species tuples in a dictionary
according to the reactions they appear in.
"""
ruleDictionary = {}
for idx, rule in enumerate(rules):
reaction2 = rule # list(parseReactions(rule))
totalElements = [item for sublist in reaction2 for item in sublist]
if tuple(totalElements) in ruleDictionary:
ruleDictionary[tuple(totalElements)].append(idx)
else:
ruleDictionary[tuple(totalElements)] = [idx]
return ruleDictionary
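# Hedged example of the resulting index (hypothetical rules):
#   self.species2Rules([[['A', 'B'], ['C']], [['C'], ['D']]])
#   -> {('A', 'B', 'C'): [0], ('C', 'D'): [1]}
# i.e. each flattened reactant/product tuple maps to the indices of the rules it occurs in.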
def checkCompliance(self, ruleCompliance, tupleCompliance, ruleBook):
"""
This method is mainly useful when a single reaction could possibly be classified
in different ways, but in the context of its tuple partners it can only be classified
as one
"""
ruleResult = np.zeros(len(ruleBook))
for validTupleIndex in np.nonzero(tupleCompliance):
for index in validTupleIndex:
for alternative in ruleBook[index]:
if "r" in alternative and np.any(
[ruleCompliance[temp] for temp in alternative["r"]]
):
ruleResult[index] = 1
break
# check if just this is enough
if "n" in alternative:
ruleResult[index] = 1
break
return ruleResult
def levenshtein(self, s1, s2):
l1 = len(s1)
l2 = len(s2)
matrix = [list(range(l1 + 1))] * (l2 + 1)
for zz in range(l2 + 1):
matrix[zz] = list(range(zz, zz + l1 + 1))
for zz in range(0, l2):
for sz in range(0, l1):
if s1[sz] == s2[zz]:
matrix[zz + 1][sz + 1] = min(
matrix[zz + 1][sz] + 1, matrix[zz][sz + 1] + 1, matrix[zz][sz]
)
else:
matrix[zz + 1][sz + 1] = min(
matrix[zz + 1][sz] + 1,
matrix[zz][sz + 1] + 1,
matrix[zz][sz] + 1,
)
return matrix[l2][l1]
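# Quick sanity check of the edit-distance routine above (illustrative only):
#   self.levenshtein("kinase", "kinases") -> 1   # one insertion
#   self.levenshtein("gdp", "gtp")        -> 1   # one substitution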
def analyzeUserDefinedEquivalences(self, molecules, conventions):
equivalences = {}
smolecules = [x.strip("()") for x in molecules]
modifiedElement = {}
for convention in conventions:
baseMol = []
modMol = []
for molecule in smolecules:
if convention[0] in molecule and convention[1] not in molecule:
baseMol.append(molecule)
elif convention[1] in molecule:
modMol.append(molecule)
if convention[2] not in equivalences:
equivalences[convention[2]] = []
equivalences[convention[2]].append((convention[0], convention[1]))
if convention[0] not in modifiedElement:
modifiedElement[convention[0]] = []
modifiedElement[convention[0]].append((convention[0], convention[1]))
"""
for mol1 in baseMol:
for mol2 in modMol:
score = self.levenshtein(mol1,mol2)
if score == self.levenshtein(convention[0],convention[1]):
equivalences[convention[2]].append((mol1,mol2))
modifiedElement[convention[0]].append((mol1,mol2))
break
"""
return equivalences, modifiedElement
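# Hedged note on the expected input format (inferred from the loop above, not a
# documented contract): each user convention is a 3-item sequence
# (baseTag, modifiedTag, modificationName), e.g. ("Ras", "Ras_GTP", "gtp"),
# and the method returns
#   equivalences: {modificationName: [(baseTag, modifiedTag)], ...}
#   modifiedElement: {baseTag: [(baseTag, modifiedTag)], ...}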
def processNamingConventions2(self, molecules, threshold=4, onlyUser=False):
# normal naming conventions
strippedMolecules = [x.strip("()") for x in molecules]
tmpTranslator = {}
translationKeys = []
conventionDict = {}
# FIXME: This line contains the single biggest execution bottleneck in the code
# we should be able to delete it
# user defined equivalence
if not onlyUser:
(
tmpTranslator,
translationKeys,
conventionDict,
) = detectOntology.analyzeNamingConventions(
strippedMolecules, self.namingConventions, similarityThreshold=threshold
)
# user defined naming convention
if self.userEquivalencesDict is None and hasattr(self, "userEquivalences"):
(
self.userEquivalencesDict,
self.modifiedElementDictionary,
) = self.analyzeUserDefinedEquivalences(molecules, self.userEquivalences)
else:
if self.userEquivalencesDict is None:
self.userEquivalencesDict = {}
"""
for name in self.userEquivalencesDict:
equivalenceTranslator[name] = self.userEquivalencesDict[name]
"""
# add stuff to the main translator
for element in self.userEquivalencesDict:
if element not in tmpTranslator:
tmpTranslator[element] = []
tmpTranslator[element].extend(self.userEquivalencesDict[element])
return tmpTranslator, translationKeys, conventionDict
def processAdHocNamingConventions(
self, reactant, product, localSpeciesDict, compartmentChangeFlag, moleculeSet
):
"""
1-1 string comparison. This method will attempt to detect if there's
a modification relationship between string <reactant> and <product>
>>> sa = SBMLAnalyzer(None,'./config/reactionDefinitions.json','./config/namingConventions.json')
>>> sa.processAdHocNamingConventions('EGF_EGFR_2','EGF_EGFR_2_P', {}, False, ['EGF','EGFR', 'EGF_EGFR_2'])
[[[['EGF_EGFR_2'], ['EGF_EGFR_2_P']], '_p', ('+ _', '+ p')]]
>>> sa.processAdHocNamingConventions('A', 'A_P', {}, False,['A','A_P']) #changes need to be at least 3 characters long
[[[['A'], ['A_P']], None, None]]
>>> sa.processAdHocNamingConventions('Ras_GDP', 'Ras_GTP', {}, False,['Ras_GDP','Ras_GTP', 'Ras'])
[[[['Ras'], ['Ras_GDP']], '_gdp', ('+ _', '+ g', '+ d', '+ p')], [[['Ras'], ['Ras_GTP']], '_gtp', ('+ _', '+ g', '+ t', '+ p')]]
>>> sa.processAdHocNamingConventions('cRas_GDP', 'cRas_GTP', {}, False,['cRas_GDP','cRas_GTP'])
[[[['cRas'], ['cRas_GDP']], '_gdp', ('+ _', '+ g', '+ d', '+ p')], [[['cRas'], ['cRas_GTP']], '_gtp', ('+ _', '+ g', '+ t', '+ p')]]
"""
# strippedMolecules = [x.strip('()') for x in molecules]
molecules = (
[reactant, product] if len(reactant) < len(product) else [product, reactant]
)
similarityThreshold = 10
if reactant == product:
return [[[[reactant], [product]], None, None]]
namePairs, differenceList, _ = detectOntology.defineEditDistanceMatrix(
molecules, similarityThreshold=similarityThreshold
)
# print('+++',namePairs,differenceList)
# print('---',detectOntology.defineEditDistanceMatrix2(molecules,similarityThreshold=similarityThreshold))
# FIXME:in here we need a smarter heuristic to detect actual modifications
# for now I'm just going with a simple heuristic that if the species name
# is long enough, and the changes from a to b are all about modification
longEnough = 3
if len(differenceList) > 0 and (
(len(reactant) >= longEnough and len(reactant) >= len(differenceList[0]))
or reactant in moleculeSet
):
# one is strictly a subset of the other a,a_b
if len([x for x in differenceList[0] if "-" in x]) == 0:
return [
[
[[reactant], [product]],
"".join([x[-1] for x in differenceList[0]]),
differenceList[0],
]
]
# strings share a common subset but contain mutually exclusive suffixes: a_b, a_c
else:
commonRoot = detectOntology.findLongestSubstring(reactant, product)
if len(commonRoot) > longEnough or commonRoot in moleculeSet:
# find if we can find a commonRoot from existing molecules
mostSimilarRealMolecules = get_close_matches(
commonRoot,
[x for x in moleculeSet if x not in [reactant, product]],
)
for commonMolecule in mostSimilarRealMolecules:
if commonMolecule in reactant and commonMolecule in product:
commonRoot = commonMolecule
logMess(
"DEBUG:LAE003",
"common root {0}={1}:{2}".format(
commonRoot, reactant, product
),
)
# if commonMolecule == commonRoot.strip('_'):
# commonRoot= commonMolecule
# break
molecules = [commonRoot, reactant, product]
(
namePairs,
differenceList,
_,
) = detectOntology.defineEditDistanceMatrix(
[commonRoot, reactant], similarityThreshold=10
)
(
namePairs2,
differenceList2,
_,
) = detectOntology.defineEditDistanceMatrix(
[commonRoot, product], similarityThreshold=10
)
namePairs.extend(namePairs2)
# print(namePairs, reactant, product)
# XXX: this was just turning the heuristic off
# for element in namePairs:
# supposed modification is actually a pre-existing species. if that happens then refuse to proceed
# if element[1] in moleculeSet:
# return [[[[reactant],[product]],None,None]]
differenceList.extend(differenceList2)
# obtain the name of the component from an anagram using the modification letters
validDifferences = [
"".join([x[-1] for x in difference])
for difference in differenceList
if "-" not in [y[0] for y in difference]
]
validDifferences.sort()
# avoid trivial differences
if len(validDifferences) < 2 or any(
[x in moleculeSet for x in validDifferences]
):
return [[[[reactant], [product]], None, None]]
# FIXME:here it'd be helpful to come up with a better heuristic
# for inferred component names
# componentName = ''.join([x[0:max(1,int(math.ceil(len(x)/2.0)))] for x in validDifferences])
# for namePair,difference in zip(namePairs,differenceList):
# if len([x for x in difference if '-' in x]) == 0:
# tag = ''.join([x[-1] for x in difference])
# if [namePair[0],tag] not in localSpeciesDict[commonRoot][componentName]:
# localSpeciesDict[namePair[0]][componentName].append([namePair[0],tag,compartmentChangeFlag])
# localSpeciesDict[namePair[1]][componentName].append([namePair[0],tag,compartmentChangeFlag])
# namePairs,differenceList,_ = detectOntology.defineEditDistanceMatrix([commonRoot,product],
#
# similarityThreshold=similarityThreshold)
return [
[
[[namePairs[y][0]], [namePairs[y][1]]],
"".join([x[-1] for x in differenceList[y]]),
differenceList[y],
]
for y in range(len(differenceList))
]
return [[[[reactant], [product]], None, None]]
def compareStrings(self, reactant, product, strippedMolecules):
if reactant in strippedMolecules:
if reactant in product:
return reactant, [reactant]
# pairedMolecules.append((reactant[idx],reactant[idx]))
# product.remove(reactant[idx])
# reactant.remove(reactant[idx])
else:
closeMatch = get_close_matches(reactant, product)
if len(closeMatch) == 1:
# pairedMolecules.append((reactant[idx],closeMatch[0]))
# product.remove(closeMatch[0])
# reactant.remove(reactant[idx])
return (reactant, closeMatch)
elif len(closeMatch) > 0:
s = difflib.SequenceMatcher()
s.set_seq1(reactant)
scoreDictionary = []
for match in closeMatch:
s.set_seq2(match)
scoreDictionary.append((s.ratio(), match))
scoreDictionary.sort(reverse=True)
return reactant, [closeMatch[0]]
else:
return None, []
else:
if reactant not in product:
closeMatch = get_close_matches(reactant, product)
if len(closeMatch) == 1:
if closeMatch[0] in strippedMolecules:
return reactant, closeMatch
else:
closeMatchToBaseMolecules = get_close_matches(
closeMatch[0], strippedMolecules
)
if len(closeMatchToBaseMolecules) == 1:
return reactant, closeMatch
return None, closeMatch
# pairedMolecules.append((reactant[idx],closeMatch[0]))
# product.remove(closeMatch[0])
# reactant.remove(reactant[idx])
else:
return None, closeMatch
# print('****',reactant[idx],closeMatch,difflib.get_close_matches(reactant[idx],strippedMolecules))
else:
mcloseMatch = get_close_matches(reactant, strippedMolecules)
# for close in mcloseMatch:
# if close in [x for x in reaction[0]]:
# return None,[close]
return None, [reactant]
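# Hedged summary of the return contract above (inferred from the branches):
# compareStrings returns a (matchedReactant, candidateProducts) pair, e.g.
#   self.compareStrings("EGF", ["EGF", "EGFR"], ["EGF", "EGFR"]) -> ("EGF", ["EGF"])
# and (None, candidates) when the token cannot be paired with a known molecule.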
def growString(
self, reactant, product, rp, pp, idx, strippedMolecules, continuityFlag
):
"""
currently this is the slowest method in the system because of all those calls to difflib
"""
idx2 = 2
treactant = [rp]
tproduct = pp
pidx = product.index(pp[0])
# print(reactant,rself.breakByActionableUnit([reactant,product],strippedMolecules))
while idx + idx2 <= len(reactant):
treactant2 = reactant[idx : min(len(reactant), idx + idx2)]
# if treactant2 != tproduct2:
if treactant2[-1] in strippedMolecules and continuityFlag:
break
else:
if len(reactant) > idx + idx2:
tailDifferences = get_close_matches(
treactant2[-1], strippedMolecules
)
if len(tailDifferences) > 0:
tdr = max(
[0]
+ [
sequenceMatcher("_".join(treactant2), x)
for x in tailDifferences
]
)
hdr = max(
[0]
+ [
sequenceMatcher(
"_".join(reactant[idx + idx2 - 1 : idx + idx2 + 1]),
x,
)
for x in tailDifferences
]
)
if tdr > hdr and tdr > 0.8:
treactant = treactant2
else:
tailDifferences = get_close_matches(
"_".join(treactant2), strippedMolecules
)
headDifferences = get_close_matches(
"_".join(reactant[idx + idx2 - 1 : idx + idx2 + 1]),
strippedMolecules,
)
if len(tailDifferences) == 0:
break
elif len(headDifferences) == 0:
treactant = treactant2
break
elif len(reactant) == idx + idx2:
tailDifferences = get_close_matches(
"_".join(treactant2), strippedMolecules
)
if len(tailDifferences) > 0:
tdr = max(
[0]
+ [
sequenceMatcher("_".join(treactant2), x)
for x in tailDifferences
]
)
if tdr > 0.8:
treactant = treactant2
else:
break
else:
break
else:
treactant = treactant2
break
idx2 += 1
idx2 = 2
while pidx + idx2 <= len(product):
tproduct2 = product[pidx : min(len(product), pidx + idx2)]
if tproduct2[-1] in strippedMolecules and continuityFlag:
break
else:
if len(product) > pidx + idx2:
tailDifferences = get_close_matches(
tproduct2[-1], strippedMolecules
)
if len(tailDifferences) > 0:
tdr = max(
[0]
+ [
sequenceMatcher("_".join(tproduct2), x)
for x in tailDifferences
]
)
hdr = max(
[0]
+ [
sequenceMatcher(
"_".join(
product[pidx + idx2 - 1 : pidx + idx2 + 1]
),
x,
)
for x in tailDifferences
]
)
if tdr > hdr and tdr > 0.8:
tproduct = tproduct2
else:
tailDifferences = get_close_matches(
"_".join(tproduct2), strippedMolecules, cutoff=0.8
)
headDifferences = get_close_matches(
"_".join(product[pidx + idx2 - 1 : pidx + idx2 + 1]),
strippedMolecules,
cutoff=0.8,
)
if len(tailDifferences) == 0:
break
elif (
len(headDifferences) == 0
or "_".join(tproduct2) in tailDifferences
):
tproduct = tproduct2
elif len(product) == pidx + idx2:
tailDifferences = get_close_matches(
"_".join(tproduct2), strippedMolecules
)
if len(tailDifferences) > 0:
tdr = max(
[0]
+ [
sequenceMatcher("_".join(tproduct2), x)
for x in tailDifferences
]
)
if tdr > 0.8:
tproduct = tproduct2
else:
break
else:
break
else:
tproduct = tproduct2
break
# if '_'.join(tproduct2) in strippedMolecules and '_'.join(treactant2) in strippedMolecules:
# tproduct = tproduct2
# treactant = treactant2
# else:
idx2 += 1
return treactant, tproduct
def approximateMatching2(
self, reactantString, productString, strippedMolecules, differenceParameter
):
"""
The meat of the naming convention matching between reactant and product is done here
tl;dr naming conventions are hard
"""
# reactantString = [x.split('_') for x in reaction[0]]
# reactantString = [[y for y in x if y!=''] for x in reactantString]
# productString = [x.split('_') for x in reaction[1]]
# productString = [[y for y in x if y!=''] for x in productString]
pairedMolecules = [[] for _ in range(len(productString))]
pairedMolecules2 = [[] for _ in range(len(reactantString))]
for stoch, reactant in enumerate(reactantString):
idx = -1
while idx + 1 < len(reactant):
idx += 1
for stoch2, product in enumerate(productString):
# print(idx2,product in enumerate(element3):)
rp, pp = self.compareStrings(
reactant[idx], product, strippedMolecules
)
if rp and rp != pp[0]:
pairedMolecules[stoch2].append((rp, pp[0]))
pairedMolecules2[stoch].append((pp[0], rp))
product.remove(pp[0])
reactant.remove(rp)
# product.remove(pp)
# reactant.remove(rp)
idx = -1
break
elif rp:
treactant, tproduct = self.growString(
reactant,
product,
rp,
pp,
idx,
strippedMolecules,
continuityFlag=True,
)
if "_".join(treactant) in strippedMolecules:
finalReactant = "_".join(treactant)
else:
reactantMatches = get_close_matches(
"_".join(treactant), strippedMolecules
)
if len(reactantMatches) > 0:
reactantScore = [
sequenceMatcher(
"".join(treactant), x.replace("_", "")
)
for x in reactantMatches
]
finalReactant = reactantMatches[
reactantScore.index(max(reactantScore))
]
else:
finalReactant = "_".join(treactant)
if "_".join(tproduct) in strippedMolecules:
finalProduct = "_".join(tproduct)
else:
productMatches = get_close_matches(
"_".join(tproduct), strippedMolecules
)
if len(productMatches) > 0:
productScore = [
sequenceMatcher(
"".join(tproduct), x.replace("_", "")
)
for x in productMatches
]
finalProduct = productMatches[
productScore.index(max(productScore))
]
else:
finalProduct = "_".join(tproduct)
pairedMolecules[stoch2].append((finalReactant, finalProduct))
pairedMolecules2[stoch].append((finalProduct, finalReactant))
for x in treactant:
reactant.remove(x)
for x in tproduct:
product.remove(x)
idx = -1
break
else:
flag = False
if pp not in [[], None]:
# if reactant[idx] == pp[0]:
treactant, tproduct = self.growString(
reactant,
product,
reactant[idx],
pp,
idx,
strippedMolecules,
continuityFlag=False,
)
# FIXME: this comparison is pretty nonsensical. treactant and tproduct are not
# guaranteed to be in the right order. why are we comparing them both at the same time
if (
len(treactant) > 1
and "_".join(treactant) in strippedMolecules
) or (
len(tproduct) > 1
and "_".join(tproduct) in strippedMolecules
):
pairedMolecules[stoch2].append(
("_".join(treactant), "_".join(tproduct))
)
pairedMolecules2[stoch].append(
("_".join(tproduct), "_".join(treactant))
)
for x in treactant:
reactant.remove(x)
for x in tproduct:
product.remove(x)
idx = -1
break
else:
rclose = get_close_matches(
"_".join(treactant), strippedMolecules
)
pclose = get_close_matches(
"_".join(tproduct), strippedMolecules
)
rclose2 = [x.split("_") for x in rclose]
rclose2 = [
"_".join([y for y in x if y != ""]) for x in rclose2
]
pclose2 = [x.split("_") for x in pclose]
pclose2 = [
"_".join([y for y in x if y != ""]) for x in pclose2
]
trueReactant = None
trueProduct = None
try:
trueReactant = rclose[
rclose2.index("_".join(treactant))
]
trueProduct = pclose[
pclose2.index("_".join(tproduct))
]
except:
pass
if trueReactant and trueProduct:
pairedMolecules[stoch2].append(
(trueReactant, trueProduct)
)
pairedMolecules2[stoch].append(
(trueProduct, trueReactant)
)
for x in treactant:
reactant.remove(x)
for x in tproduct:
product.remove(x)
idx = -1
break
if (
sum(len(x) for x in reactantString + productString) > 0
and self.conservationOfMass
):
return None, None
else:
return pairedMolecules, pairedMolecules2
def approximateMatching(self, ruleList, differenceParameter=[]):
def curateString(
element,
differences,
symbolList=["#", "&", ";", "@", "!", "?"],
equivalenceDict={},
):
"""
remove compound differences (>2 characters) and instead represent them with symbols
returns the transformed string, an equivalence dictionary and the unused symbols
"""
tmp = element
for difference in differences:
if difference in element:
if difference.startswith("_"):
if difference not in equivalenceDict:
symbol = symbolList.pop()
equivalenceDict[difference] = symbol
else:
symbol = equivalenceDict[difference]
tmp = re.sub(
r"{0}(_|$)".format(difference), r"{0}\1".format(symbol), tmp
)
elif difference.endswith("_"):
if difference not in equivalenceDict:
symbol = symbolList.pop()
equivalenceDict[difference] = symbol
else:
symbol = equivalenceDict[difference]
tmp = re.sub(
r"(_|^){0}".format(difference), r"{0}\1".format(symbol), tmp
)
return tmp, symbolList, equivalenceDict
"""
given a transformation of the kind a + b -> ~a_~b, where ~a and ~b are
slightly modified versions of a and b, this function will return a list of
lexical changes that a and b must undergo to become ~a and ~b.
"""
flag = True
if len(ruleList[1]) == 1 and ruleList[1] != "0":
differences = deepcopy(differenceParameter)
tmpRuleList = deepcopy(ruleList)
while flag:
flag = False
sym = ["#", "&", ";", "@", "!", "?"]
dic = {}
for idx, _ in enumerate(tmpRuleList[0]):
tmpRuleList[0][idx], sym, dic = curateString(
ruleList[0][idx], differences, sym, dic
)
tmpRuleList[1][0], sym, dic = curateString(
ruleList[1][0], differences, sym, dic
)
permutations = [x for x in itertools.permutations(ruleList[0])]
tpermutations = [x for x in itertools.permutations(tmpRuleList[0])]
score = [
difflib.SequenceMatcher(None, "_".join(x), ruleList[1][0]).ratio()
for x in permutations
]
maxindex = score.index(max(score))
ruleList[0] = list(permutations[maxindex])
tmpRuleList[0] = list(tpermutations[maxindex])
sym = [dic[x] for x in dic]
sym.extend(differences)
sym = [x for x in sym if "_" not in x]
simplifiedDifference = difflib.SequenceMatcher(
lambda x: x in sym, "-".join(tmpRuleList[0]), tmpRuleList[1][0]
)
matches = simplifiedDifference.get_matching_blocks()
if len(matches) != len(ruleList[0]) + 1:
return [[], []], [[], []]
productPartitions = []
for idx, match in enumerate(matches):
if matches[idx][2] != 0:
productPartitions.append(
tmpRuleList[1][0][
matches[idx][1] : matches[idx][1] + matches[idx][2]
]
)
reactantPartitions = tmpRuleList[0]
# Don't count trailing underscores as part of the species name
for idx, _ in enumerate(reactantPartitions):
reactantPartitions[idx] = reactantPartitions[idx].strip("_")
for idx, _ in enumerate(productPartitions):
productPartitions[idx] = productPartitions[idx].strip("_")
# greedy matching
acc = 0
# FIXME: it's not properly copying the whole string
for idx in range(0, len(matches) - 1):
while (
matches[idx][2] + acc < len(tmpRuleList[1][0])
and tmpRuleList[1][0][matches[idx][2] + acc] in sym
):
productPartitions[idx] += tmpRuleList[1][0][
matches[idx][2] + acc
]
acc += 1
# idx = 0
# while(tmpString[matches[0][2]+ idx] in sym):
# reactantfirstHalf += tmpString[matches[0][2] + idx]
# idx += 1
for element in dic:
for idx in range(len(productPartitions)):
productPartitions[idx] = productPartitions[idx].replace(
dic[element], element
)
reactantPartitions[idx] = reactantPartitions[idx].replace(
dic[element], element
)
zippedPartitions = list(zip(reactantPartitions, productPartitions))
zippedPartitions = [sorted(x, key=len) for x in zippedPartitions]
bdifferences = [
[z for z in y if "+ " in z or "- " in z]
for y in [difflib.ndiff(*x) for x in zippedPartitions]
]
processedDifferences = [
"".join([y.strip("+ ") for y in x]) for x in bdifferences
]
for idx, processedDifference in enumerate(processedDifferences):
if (
processedDifference not in differences
and "- " not in processedDifference
and bdifferences[idx] != []
):
flag = True
differences.append(processedDifference)
else:
# TODO: deal with reactions of the kind a + b -> c + d
return [[], []], [[], []]
return bdifferences, zippedPartitions
def getReactionClassification(
self,
reactionDefinition,
rules,
equivalenceTranslator,
indirectEquivalenceTranslator,
translationKeys=[],
):
"""
*reactionDefinition* is a list of conditions that must be met for a reaction
to be classified a certain way
*rules* is the list of reactions
*equivalenceTranslator* is a dictionary containing all complexes that have been
determined to be the same through naming conventions
This method will go through the list of rules and the list of rule definitions
and tell us which rules it can classify according to the rule definitions list
provided
"""
ruleDictionary = self.species2Rules(rules)
# determines a reaction's reactionStructure, aka stoichiometry
ruleComplianceMatrix = np.zeros(
(len(rules), len(reactionDefinition["reactions"]))
)
for idx, rule in enumerate(rules):
reaction2 = rule # list(parseReactions(rule))
ruleComplianceMatrix[idx] = self.identifyReactions2(
reaction2, reactionDefinition
)
# initialize the tupleComplianceMatrix array with the same keys as ruleDictionary
# the tuple complianceMatrix is basically there to make sure we evaluate
# bidirectional reactions as one reaction
tupleComplianceMatrix = {
key: np.zeros((len(reactionDefinition["reactions"])))
for key in ruleDictionary
}
# check which reaction conditions each tuple satisfies
for element in ruleDictionary:
for rule in ruleDictionary[element]:
tupleComplianceMatrix[element] += ruleComplianceMatrix[rule]
# now we will check for the nameConventionMatrix (same thing as before but for naming conventions)
tupleNameComplianceMatrix = {
key: {key2: 0 for key2 in equivalenceTranslator} for key in ruleDictionary
}
for rule in ruleDictionary:
for namingConvention in equivalenceTranslator:
for equivalence in equivalenceTranslator[namingConvention]:
if all(element in rule for element in equivalence):
tupleNameComplianceMatrix[rule][namingConvention] += 1
break
for equivalence in indirectEquivalenceTranslator[namingConvention]:
if all(element in rule for element in equivalence[0]):
tupleNameComplianceMatrix[rule][namingConvention] += 1
break
# we can have more than one
# elif appro
# check if the reaction conditions each tuple satisfies are enough to get classified
# as a specific named reaction type
tupleDefinitionMatrix = {
key: np.zeros((len(reactionDefinition["definitions"])))
for key in ruleDictionary
}
for key, element in list(tupleComplianceMatrix.items()):
for idx, member in enumerate(reactionDefinition["definitions"]):
for alternative in member:
if "r" in alternative:
tupleDefinitionMatrix[key][idx] += np.all(
[element[reaction] for reaction in alternative["r"]]
)
if (
"n" in alternative
and reactionDefinition["reactionsNames"][idx]
in equivalenceTranslator
):
tupleDefinitionMatrix[key][idx] += np.all(
[
tupleNameComplianceMatrix[key][
reactionDefinition["reactionsNames"][idx]
]
]
)
# contains which rules are equal to reactions defined in reactionDefinitions['definitions']
# use the per tuple classification to obtain a per reaction classification
ruleDefinitionMatrix = np.zeros(
(len(rules), len(reactionDefinition["definitions"]))
)
for key, element in list(ruleDictionary.items()):
for rule in element:
ruleDefinitionMatrix[rule] = self.checkCompliance(
ruleComplianceMatrix[rule],
tupleDefinitionMatrix[key],
reactionDefinition["definitions"],
)
# use reactionDefinitions reactionNames field to actually tell us what reaction
# type each reaction is
results = []
for idx, element in enumerate(ruleDefinitionMatrix):
nonZero = np.nonzero(element)[0]
if len(nonZero) == 0:
results.append("None")
# todo: need to do something if it matches more than one reaction
else:
classifications = [
reactionDefinition["reactionsNames"][x] for x in nonZero
]
# FIXME: we should be able to support more than one transformation
results.append(classifications[0])
return results
def setConfigurationFile(self, configurationFile):
self.configurationFile = configurationFile
def getReactionProperties(self):
"""
if we are using a naming convention definition in the json file
this method will return the component and state names that this
reaction uses
"""
# TODO: once we transition completely to a naming convention delete
# this ----
reactionTypeProperties = {}
reactionDefinition = self.loadConfigFiles(self.configurationFile)
if self.speciesEquivalences != None:
self.userEquivalences = self.loadConfigFiles(self.speciesEquivalences)[
"reactionDefinition"
]
for reactionType, properties in zip(
reactionDefinition["reactionsNames"], reactionDefinition["definitions"]
):
# if its a reaction defined by its naming convention
# xxxxxxxxxxxxxxxxxxx
for alternative in properties:
if "n" in list(alternative.keys()):
try:
site = reactionDefinition["reactionSite"][alternative["rsi"]]
state = reactionDefinition["reactionState"][alternative["rst"]]
except:
site = reactionType
state = reactionType[0]
reactionTypeProperties[reactionType] = [site, state]
# TODO: end of delete
reactionDefinition = self.namingConventions
for idx, reactionType in enumerate(reactionDefinition["modificationList"]):
site = reactionDefinition["reactionSite"][
reactionDefinition["definitions"][idx]["rsi"]
]
state = reactionDefinition["reactionState"][
reactionDefinition["definitions"][idx]["rst"]
]
reactionTypeProperties[reactionType] = [site, state]
return reactionTypeProperties
def processFuzzyReaction(
self, reaction, translationKeys, conventionDict, indirectEquivalenceTranslator
):
differences, pairedChemicals = self.approximateMatching(
reaction, translationKeys
)
# matching,matching2 = self.approximateMatching2(reaction,strippedMolecules,
# translationKeys)
d1, d2 = differences[0], differences[1]
firstMatch, secondMatch = pairedChemicals[0], pairedChemicals[1]
matches = [firstMatch, secondMatch]
for index, element in enumerate([d1, d2]):
idx1 = 0
idx2 = 1
while idx2 <= len(element):
if (element[idx1],) in list(conventionDict.keys()):
pattern = conventionDict[(element[idx1],)]
indirectEquivalenceTranslator[pattern].append(
[
[reaction[0][index], reaction[1][0]],
reaction[0],
matches[index],
reaction[1],
]
)
elif (element[idx1].replace("-", "+"),) in list(conventionDict.keys()):
matches[index].reverse()
transformedPattern = conventionDict[
(element[idx1].replace("-", "+"),)
]
indirectEquivalenceTranslator[transformedPattern].append(
[
[reaction[1][0], reaction[0][index]],
reaction[0],
matches[index],
reaction[1],
]
)
elif idx2 < len(element):
if tuple([element[idx1], element[idx2]]) in list(
conventionDict.keys()
):
pattern = conventionDict[tuple([element[idx1], element[idx2]])]
indirectEquivalenceTranslator[pattern].append(
[
[reaction[0][index], reaction[1][0]],
reaction[0],
matches[index],
reaction[1],
]
)
idx1 += 1
idx2 += 1
elif "-" in element[idx1] and "-" in element[idx2]:
if tuple(
[
element[idx1].replace("-", "+"),
element[idx2].replace("-", "+"),
]
) in list(conventionDict.keys()):
matches[index].reverse()
transformedPattern = conventionDict[
tuple(
[
element[idx1].replace("-", "+"),
element[idx2].replace("-", "+"),
]
)
]
indirectEquivalenceTranslator[transformedPattern].append(
[
[reaction[1][0], reaction[0][index]],
reaction[0],
matches[index],
reaction[1],
]
)
idx1 += 1
idx2 += 1
idx1 += 1
idx2 += 1
def removeExactMatches(self, reactantList, productList):
"""
goes through the list of lists reactantList and productList and removes the intersection
"""
reactantFlat = Counter([y for x in reactantList for y in x])
productFlat = Counter([y for x in productList for y in x])
intersection = reactantFlat & productFlat
intersection2 = deepcopy(intersection)
newReactant = []
newProduct = []
for chemical in reactantList:
tmp = []
for element in chemical:
if intersection[element] > 0:
intersection[element] -= 1
else:
tmp.append(element)
newReactant.append(tmp)
for chemical in productList:
tmp = []
for element in chemical:
if intersection2[element] > 0:
intersection2[element] -= 1
else:
tmp.append(element)
newProduct.append(tmp)
return newReactant, newProduct
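# Illustrative example (hypothetical species lists): chemicals common to both
# sides are removed once per shared occurrence,
#   self.removeExactMatches([['A', 'B']], [['A', 'C']]) -> ([['B']], [['C']])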
def findBiggestActionable(self, chemicalList, chemicalCandidatesList):
actionableList = []
for chemical, chemicalCandidates in zip(chemicalList, chemicalCandidatesList):
if len(chemicalCandidates) == 0:
return None
if len(chemicalCandidates) == 1:
actionableList.append([chemical])
continue
# find all combinations
scoreDict = []
result = 0
try:
for i in range(1, len(chemicalCandidates) + 1):
combinations = list(itertools.permutations(chemicalCandidates, i))
for x in combinations:
score = difflib.SequenceMatcher(
None, "_".join(x), chemical
).quick_ratio()
if score == 1:
result = x
raise IOError
elif score > 0:
scoreDict.append([x, score])
except IOError:
scoreDict = [[result, 1.0]]
scoreDict.sort(key=lambda x: [x[1], -len(x[0])], reverse=True)
if len(scoreDict) > 0:
actionableList.append(list(scoreDict[0][0]))
else:
print(actionableList)
raise Exception
return actionableList
def breakByActionableUnit(self, reaction, strippedMolecules):
# find valid actionable units from the list of molecules in the system
validCandidatesReactants = [
[y for y in strippedMolecules if y in x] for x in reaction[0]
]
validCandidatesProducts = [
[y for y in strippedMolecules if y in x] for x in reaction[1]
]
# find the subset of intersection parts between reactants and products
intermediateVector = [
list(
filter(
lambda x: any(
[
len(
[
z
for z in difflib.ndiff(x, y)
if "+" in z[0] or "-" in z[0]
]
)
<= 3
for z in validCandidatesProducts
for y in z
]
),
sublist,
)
)
for sublist in validCandidatesReactants
]
intermediateVector = [
list(
filter(
lambda x: any(
[
len(
[
z
for z in difflib.ndiff(x, y)
if "+" in z[0] or "-" in z[0]
]
)
<= 3
for z in intermediateVector
for y in z
]
),
sublist,
)
)
for sublist in validCandidatesProducts
]
tmpReactant = [
[
list(
filter(
lambda y: len([x for x in intermediateVector[0] if y in x])
== 1,
reactant,
)
)
]
for reactant in validCandidatesReactants
]
tmpProduct = [
[
list(
filter(
lambda y: len([x for x in intermediateVector[0] if y in x])
== 1,
reactant,
)
)
]
for reactant in validCandidatesProducts
]
# print(validCandidatesReactants,validCandidatesProducts,intermediateVector)
# print('......',reaction)
# print('\t......',validCandidatesReactants,validCandidatesProducts)
# find biggest subset of actionable units
reactantList = self.findBiggestActionable(reaction[0], validCandidatesReactants)
productList = self.findBiggestActionable(reaction[1], validCandidatesProducts)
# print('\t\t+++++',reactantList,productList)
return reactantList, productList
def testAgainstExistingConventions(self, fuzzyKey, modificationList, threshold=4):
@memoize
def testAgainstExistingConventionsHelper(fuzzyKey, modificationList, threshold):
if not fuzzyKey:
return None
for i in range(1, threshold):
combinations = itertools.permutations(modificationList, i)
validKeys = list(
filter(
lambda x: ("".join(x)).upper() == fuzzyKey.upper(), combinations
)
)
if validKeys:
return validKeys
return None
return testAgainstExistingConventionsHelper(
fuzzyKey, modificationList, threshold
)
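# Hedged example, assuming the modification list already contains '_p' and '_gdp':
#   self.testAgainstExistingConventions('_P', ['_p', '_gdp'])     -> [('_p',)]
#   self.testAgainstExistingConventions('_gdp_p', ['_p', '_gdp']) -> [('_gdp', '_p')]
# i.e. the fuzzy key is explained as a case-insensitive concatenation of up to
# three known modifications; None is returned when no combination matches.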
def classifyReactions(self, reactions, molecules, externalDependencyGraph={}):
"""
classifies a group of reactions according to the information in the json
config file
FIXME: the classifyReactions function is currently the biggest bottleneck in atomizer, taking up
to 80% of the time without counting pathwaycommons querying.
"""
def createArtificialNamingConvention(reaction, fuzzyKey, fuzzyDifference):
"""
            Does the actual data-structure filling when a 1-1 reaction shows
            signs of modification. Returns True if a change was performed.
"""
# fuzzyKey,fuzzyDifference = self.processAdHocNamingConventions(reaction[0][0],reaction[1][0],localSpeciesDict,compartmentChangeFlag)
if fuzzyKey and fuzzyKey.strip("_").lower() not in [
x.lower() for x in strippedMolecules
]:
                # if this state is not yet in the dependency graph's preliminary data structures
if "{0}".format(fuzzyKey) not in equivalenceTranslator:
# print('---','{0}'.format(fuzzyKey),equivalenceTranslator.keys())
                    # check whether a combination of existing keys deals with this modification without the need of creating a new one
if self.testAgainstExistingConventions(
fuzzyKey, self.namingConventions["modificationList"]
):
logMess(
"INFO:LAE005",
"added relationship through existing convention in reaction {0}".format(
str(reaction)
),
)
if "{0}".format(fuzzyKey) not in equivalenceTranslator:
equivalenceTranslator["{0}".format(fuzzyKey)] = []
if "{0}".format(fuzzyKey) not in indirectEquivalenceTranslator:
indirectEquivalenceTranslator["{0}".format(fuzzyKey)] = []
if (
tuple(sorted([x[0] for x in reaction], key=len))
not in equivalenceTranslator["{0}".format(fuzzyKey)]
):
equivalenceTranslator["{0}".format(fuzzyKey)].append(
tuple(sorted([x[0] for x in reaction], key=len))
)
return
logMess(
"INFO:LAE004",
"{0}:{1}:added induced naming convention".format(
reaction[0][0], reaction[1][0]
),
)
equivalenceTranslator["{0}".format(fuzzyKey)] = []
if fuzzyKey == "0":
tmpState = "ON"
else:
tmpState = fuzzyKey.upper()
adhocLabelDictionary["{0}".format(fuzzyKey)] = [
"{0}".format(fuzzyKey),
tmpState,
]
# fill main naming convention data structure
self.namingConventions["modificationList"].append(
"{0}".format(fuzzyKey)
)
self.namingConventions["reactionState"].append(tmpState)
self.namingConventions["reactionSite"].append(
"{0}".format(fuzzyKey)
)
self.namingConventions["patterns"][fuzzyDifference] = "{0}".format(
fuzzyKey
)
self.namingConventions["definitions"].append(
{
"rst": len(self.namingConventions["reactionState"]) - 1,
"rsi": len(self.namingConventions["reactionSite"]) - 1,
}
)
if fuzzyKey not in translationKeys:
translationKeys.append(fuzzyKey)
                    # only add if this same definition doesn't already exist; this avoids cycles
if (
tuple(sorted([x[0] for x in reaction], key=len))
not in equivalenceTranslator["{0}".format(fuzzyKey)]
):
equivalenceTranslator["{0}".format(fuzzyKey)].append(
tuple(sorted([x[0] for x in reaction], key=len))
)
newTranslationKeys.append(fuzzyKey)
conventionDict[fuzzyDifference] = "{0}".format(fuzzyKey)
if "{0}".format(fuzzyKey) not in indirectEquivalenceTranslator:
indirectEquivalenceTranslator["{0}".format(fuzzyKey)] = []
return True
return False
# load the json config file
reactionDefinition = self.loadConfigFiles(self.configurationFile)
rawReactions = []
for x in reactions:
tmp = parseReactions(x)
if tmp:
rawReactions.append(tmp)
# rawReactions = [parseReactions(x) for x in reactions if parseReactions(x)]
strippedMolecules = [x.strip("()") for x in molecules]
reactionnetworkelements = set([z for x in rawReactions for y in x for z in y])
        # only keep those molecules that appear in the reaction network
strippedMolecules = [
x for x in strippedMolecules if x in reactionnetworkelements
]
# load user defined complexes
        if self.speciesEquivalences is not None:
self.userEquivalences = self.loadConfigFiles(self.speciesEquivalences)[
"reactionDefinition"
]
# determines if two molecules have a relationship according to the naming convention section
# equivalenceTranslator is a dictionary of actual modifications
        # example {'Phosphorylation':[['A','A_p'],['B','B_p']]}
# process straightforward naming conventions
        # XXX: we should take this function out of processNamingConventions2 and all processes that call it
(
tmpTranslator,
translationKeys,
conventionDict,
) = detectOntology.analyzeNamingConventions(
strippedMolecules, self.userNamingConventions, similarityThreshold=10
)
userEquivalenceTranslator, _, _ = self.processNamingConventions2(
strippedMolecules, onlyUser=True
)
for element in tmpTranslator:
if element in userEquivalenceTranslator:
userEquivalenceTranslator[element].extend(tmpTranslator[element])
else:
userEquivalenceTranslator[element] = tmpTranslator[element]
equivalenceTranslator = copy(userEquivalenceTranslator)
newTranslationKeys = []
adhocLabelDictionary = {}
# lists of plain reactions
# process fuzzy naming conventions based on reaction information
indirectEquivalenceTranslator = {x: [] for x in equivalenceTranslator}
localSpeciesDict = defaultdict(lambda: defaultdict(list))
trueBindingReactions = []
        # the lexical dependency graph merely applies lexical analysis to detect which components
        # on the left-hand side of a given reaction match to different ones on the right-hand side
lexicalDependencyGraph = defaultdict(list)
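        # e.g. (hypothetical): an entry {'A_p': [['A']]} records that 'A_p' is lexically
        # derived from 'A'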
strippedMolecules = [x.strip("()") for x in molecules]
        # only keep those molecules that appear in the reaction network
strippedMolecules = [
x for x in strippedMolecules if x in reactionnetworkelements
]
for idx, reaction in enumerate(rawReactions):
flagstar = False
if (
len(reaction[0]) == 1
and len(reaction[1]) == 1
and len(reaction[0][0]) > len(reaction[1][0])
):
                # unmodification/relaxation
flagstar = True
reaction = [reaction[1], reaction[0]]
# should we reuse information obtained from other methods?
            # FIXME: instead of doing a simple split by '_' we should compare against the molecules in strippedMolecules and split by smallest actionable units.
if externalDependencyGraph == {}:
# print('-----',reaction)
# reactantString, productString = self.breakByActionableUnit(reaction, strippedMolecules)
# print('...',reaction, reactantString, productString)
# if not reactantString or not productString:
reactantString = [x.split("_") for x in reaction[0]]
reactantString = [[y for y in x if y != ""] for x in reactantString]
productString = [x.split("_") for x in reaction[1]]
productString = [[y for y in x if y != ""] for x in productString]
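                # e.g. (hypothetical): a reactant named 'EGF_EGFR' is split into ['EGF', 'EGFR']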
else:
reactantString = []
productString = []
# check how the reactants are composed and add it to the list
for element in reaction[0]:
if (
element not in externalDependencyGraph
or externalDependencyGraph[element] == []
):
reactantString.append([element])
else:
reactantString.append(
deepcopy(externalDependencyGraph[element][0])
)
# same for products
for element in reaction[1]:
if (
element not in externalDependencyGraph
or externalDependencyGraph[element] == []
):
productString.append([element])
else:
productString.append(
deepcopy(externalDependencyGraph[element][0])
)
            # remove those chemicals that match exactly on both sides, since those are not interesting,
            # and unlike lexical pattern matching we are not going to try to increase the string size
reactantString, productString = self.removeExactMatches(
reactantString, productString
)
if [0] in reactantString or [0] in productString:
continue
matching, matching2 = self.approximateMatching2(
reactantString, productString, strippedMolecules, translationKeys
)
# print(reaction, matching)
# if matching and flagstar:
# logMess('DEBUG:Atomization', 'inverting order of {0} for lexical analysis'.format([reaction[1], reaction[0]]))
flag = True
if matching:
for reactant, matches in zip(reaction[1], matching):
for match in matches:
pair = list(match)
pair.sort(key=len)
fuzzyList = self.processAdHocNamingConventions(
pair[0], pair[1], localSpeciesDict, False, strippedMolecules
)
for fuzzyReaction, fuzzyKey, fuzzyDifference in fuzzyList:
if (
                                fuzzyKey is None
and fuzzyReaction[0] != fuzzyReaction[1]
):
flag = False
                                # logMess('Warning:ATOMIZATION','We could not find a meaningful \
                                # mapping in {0} when lexically analyzing {1}.'.format(pair,reactant))
createArtificialNamingConvention(
fuzzyReaction, fuzzyKey, fuzzyDifference
)
if (
flag
and sorted([x[1] for x in matches])
not in lexicalDependencyGraph[reactant]
):
                        # don't introduce cyclic dependencies
if all([x[1] != reactant for x in matches]):
lexicalDependencyGraph[reactant].append(
sorted([x[1] for x in matches])
)
for x in matches:
# TODO(Oct14): it would be better to try to map this to an
# existing molecule instead of trying to create a new one
if x[1] not in strippedMolecules:
if len(x[1]) > len(x[0]):
lexicalDependencyGraph[x[1]] = [[x[0]]]
else:
lexicalDependencyGraph[x[0]] = [[x[1]]]
lexicalDependencyGraph[x[1]] = []
translationKeys.extend(newTranslationKeys)
for species in localSpeciesDict:
speciesName = localSpeciesDict[species][
list(localSpeciesDict[species].keys())[0]
][0][0]
definition = [species]
sdefinition = [speciesName]
for component in localSpeciesDict[species]:
cdefinition = []
states = [
["s", state[1]] for state in localSpeciesDict[species][component]
]
for state in states:
cdefinition.extend(state)
cdefinition = [component, cdefinition]
sdefinition.extend(cdefinition)
definition.append([sdefinition])
self.lexicalSpecies.append(definition)
# definition = [commonRoot,[[commonRoot,componentName,["s",tag]]]]
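        # e.g. (hypothetical): ['EGFR', [['EGFR', 'Y1068', ['s', 'p']]]]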
reactionClassification = self.getReactionClassification(
reactionDefinition,
rawReactions,
equivalenceTranslator,
indirectEquivalenceTranslator,
translationKeys,
)
for element in trueBindingReactions:
reactionClassification[element] = "Binding"
listOfEquivalences = []
for element in equivalenceTranslator:
listOfEquivalences.extend(equivalenceTranslator[element])
return (
reactionClassification,
listOfEquivalences,
equivalenceTranslator,
indirectEquivalenceTranslator,
adhocLabelDictionary,
lexicalDependencyGraph,
userEquivalenceTranslator,
)
[docs] def processAnnotations(self, molecules, annotations):
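        """
        builds candidate equivalence pairs from annotation information: for every
        annotation shared by more than one molecule, all two-element combinations of
        those molecules are collected under the catch-all key -1
        """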
processedAnnotations = []
for element in annotations:
if len(annotations[element]) > 1:
                pro = [
                    list(x)
                    for x in itertools.combinations(annotations[element], 2)
                ]
processedAnnotations.extend(pro)
return {-1: processedAnnotations}
[docs] def classifyReactionsWithAnnotations(
self, reactions, molecules, annotations, labelDictionary
):
"""
        this method goes through the list of reactions and assigns a 'modification' tag to those reactions where
        some kind of modification takes place, aided by annotation information
"""
        # skip comment lines, for which parseReactions returns None
        rawReactions = [y for y in (parseReactions(x) for x in reactions) if y]
equivalenceTranslator = self.processAnnotations(molecules, annotations)
for reactionIndex in range(0, len(rawReactions)):
for reactantIndex in range(0, len(rawReactions[reactionIndex])):
tmp = []
for chemicalIndex in range(
0, len(rawReactions[reactionIndex][reactantIndex])
):
tmp.extend(
list(
labelDictionary[
rawReactions[reactionIndex][reactantIndex][
chemicalIndex
]
]
)
)
rawReactions[reactionIndex][reactantIndex] = tmp
# self.annotationClassificationHelper(rawReactions,equivalenceTranslator[-1])
[docs] def userJsonToDataStructure(
self,
patternName,
userEquivalence,
dictionary,
labelDictionary,
equivalencesList,
):
"""
converts a user defined species to an internal representation
"""
tmp = st.Species()
label = []
for molecule in userEquivalence[1]:
if molecule[0] == 0:
labelDictionary[patternName] = 0
return
tmp2 = st.Molecule(molecule[0])
for componentIdx in range(1, len(molecule), 2):
tmp3 = st.Component(molecule[componentIdx])
for bindStateIdx in range(0, len(molecule[componentIdx + 1]), 2):
if molecule[componentIdx + 1][bindStateIdx] == "b":
tmp3.addBond(molecule[componentIdx + 1][bindStateIdx + 1])
elif molecule[componentIdx + 1][bindStateIdx] == "s":
tmp3.addState("0")
tmp3.addState(molecule[componentIdx + 1][bindStateIdx + 1])
equivalencesList.append([patternName, molecule[0]])
# tmp3.addState(molecule[2][2])
tmp2.addComponent(tmp3)
stmp = st.Species()
stmp.addMolecule(deepcopy(tmp2))
stmp.reset()
# in case one definition overlaps another
if molecule[0] in dictionary:
dictionary[molecule[0]].extend(deepcopy(stmp))
else:
dictionary[molecule[0]] = deepcopy(stmp)
labelDictionary[molecule[0]] = [(molecule[0],)]
label.append(molecule[0])
# for component in tmp2.components:
# if component.name == molecule[1]:
# component.setActiveState(molecule[2][1])
tmp.addMolecule(tmp2)
if patternName in dictionary:
dictionary[patternName].extend(deepcopy(tmp))
else:
dictionary[patternName] = deepcopy(tmp)
labelDictionary[patternName] = [tuple(label)]
[docs] def getUserDefinedComplexes(self):
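        """
        loads user-defined complex, modification and partial-complex definitions from
        the species equivalences file (if one was provided), merges them with species
        obtained through lexical analysis, and returns the resulting species and
        label dictionaries
        """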
dictionary = {}
partialDictionary = {}
userLabelDictionary = {}
equivalencesList = []
lexicalLabelDictionary = {}
if self.speciesEquivalences is not None:
speciesdictionary = self.loadConfigFiles(self.speciesEquivalences)
userEquivalences = (
speciesdictionary["complexDefinition"]
if "complexDefinition" in speciesdictionary
                else []
)
for element in userEquivalences:
self.userJsonToDataStructure(
element[0],
element,
dictionary,
userLabelDictionary,
equivalencesList,
)
            complexEquivalences = speciesdictionary.get("modificationDefinition", {})
for element in complexEquivalences:
userLabelDictionary[element] = [tuple(complexEquivalences[element])]
partialUserEquivalences = (
speciesdictionary["partialComplexDefinition"]
if "partialComplexDefinition" in speciesdictionary
else []
)
for element in partialUserEquivalences:
self.userJsonToDataStructure(
tuple(sorted(element[0])), element, partialDictionary, {}, []
)
        # species obtained from string similarity (lexical analysis)
for element in self.lexicalSpecies:
self.userJsonToDataStructure(
element[0],
element,
dictionary,
lexicalLabelDictionary,
equivalencesList,
)
return (
dictionary,
userLabelDictionary,
lexicalLabelDictionary,
partialDictionary,
)