Source code for bionetgen.atomizer.utils.pathwaycommons

import urllib.request, urllib.parse, urllib.error
import functools
import marshal
from .util import logMess
import json


def memoize(obj):
    cache = obj.cache = {}

    @functools.wraps(obj)
    def memoizer(*args, **kwargs):
        # key = str(args) + str(kwargs)
        key = marshal.dumps([str(obj), args, kwargs])
        if key not in cache:
            cache[key] = obj(*args, **kwargs)
        return cache[key]

    return memoizer
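# Usage sketch for the memoizer above (an illustration added for this listing, not
# part of the original module): the first call computes and stores the result under a
# key built from the marshalled arguments, a repeated call is served from the cache
# attached to the decorated function.
#
#   @memoize
#   def slowAdd(a, b):
#       return a + b
#
#   slowAdd(1, 2)  # computed and stored
#   slowAdd(1, 2)  # returned from slowAdd.cache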
'''
from bioservices import UniProt

u = UniProt(verbose=False)


@memoize
def name2uniprot(nameStr):
    """
    get the uniprot id for a given biological name. gives preference to human data
    """
    data = u.search('{0}+AND+organism:9606'.format(nameStr), limit=5, columns="entry name,id")
    if len(data) == 0:
        data = u.search('{0}'.format(nameStr), limit=10, columns="entry name,id")
    parsedData = [x.split('\t') for x in data.split('\n')][1:]
    if len([x for x in parsedData if nameStr in x[0]]) > 0:
        return [x[1] for x in parsedData if nameStr in x[0]]
    return [x[1] for x in parsedData if len(x) == 2]
'''
@memoize
def queryBioGridByName(name1, name2, organism, truename1, truename2):
    url = "http://webservice.thebiogrid.org/interactions/?"
    response = None
    if organism:
        organismExtract = list(organism)[0].split("/")[-1]
        d = {
            "geneList": "|".join([name1, name2]),
            "taxId": "|".join(organism),
            "format": "json",
            "accesskey": "f74b8d6f4c394fcc9d97b11c8c83d7f3",
            "includeInteractors": "false",
        }
        # FIXME: check if all "organism"s are the wrong thing,
        # for model 48 this returns a process identifier https://www.ebi.ac.uk/QuickGO/term/GO:0007173
        # and not an organism taxonomy identifier
        data = urllib.parse.urlencode(d).encode("utf-8")
        try:
            response = urllib.request.urlopen(url, data=data).read()
        except urllib.error.HTTPError:
            logMess(
                "ERROR:MSC02",
                "A connection could not be established to biogrid while testing with taxon {1} and genes {0}, trying without organism taxonomy limitation".format(
                    "|".join([name1, name2]), "|".join(organism)
                ),
            )
            # return False
    if response is None:
        d = {
            "geneList": "|".join([name1, name2]),
            "format": "json",
            "accesskey": "f74b8d6f4c394fcc9d97b11c8c83d7f3",
            "includeInteractors": "false",
        }
        data = urllib.parse.urlencode(d).encode("utf-8")
        try:
            response = urllib.request.urlopen(url, data=data).read()
        except urllib.error.HTTPError:
            logMess("ERROR:MSC02", "A connection could not be established to biogrid")
            return False
    results = json.loads(response)
    referenceName1 = truename1.lower() if truename1 else name1.lower()
    referenceName2 = truename2.lower() if truename2 else name2.lower()
    for result in results:
        resultName1 = results[result]["OFFICIAL_SYMBOL_A"].lower()
        resultName2 = results[result]["OFFICIAL_SYMBOL_B"].lower()
        synonymName1 = results[result]["SYNONYMS_A"].split("|")
        synonymName1 = [x.lower() for x in synonymName1]
        synonymName2 = results[result]["SYNONYMS_B"].split("|")
        synonymName2 = [x.lower() for x in synonymName2]
        # FIXME: This should correctly warn the user where the interaction is coming
        # from exactly
        # FIXME: Let the user select individual interactions to include. Maybe an
        # interactive mode
        if truename1 is not None and truename2 is not None and resultName1 != resultName2:
            logMess(
                "WARNING:ATO005",
                "BioGrid result only matched a synonym. "
                + f"{resultName1} to {resultName2}",
            )
            return True
        elif (
            truename1 is not None
            and truename2 is not None
            and truename1 == truename2
            and resultName1 == resultName2
        ):
            logMess(
                "WARNING:ATO005",
                "BioGrid result only matched a synonym. "
                + f"{truename1} to {truename2} or "
                + f"{resultName1} to {resultName2}",
            )
            return True
        if (referenceName1 == resultName1 or referenceName1 in synonymName1) and (
            referenceName2 == resultName2 or referenceName2 in synonymName2
        ):
            logMess(
                "WARNING:ATO005",
                "BioGrid result only matched a synonym. "
                + f"{referenceName1} to {resultName1} or "
                + f"{referenceName1} to {synonymName1} or "
                + f"{referenceName2} to {resultName2} or "
                + f"{referenceName2} to {synonymName2}",
            )
            return True
        if (referenceName2 == resultName1 or referenceName2 in synonymName1) and (
            referenceName1 == resultName2 or referenceName1 in synonymName2
        ):
            logMess(
                "WARNING:ATO005",
                "BioGrid result only matched a synonym. "
                + f"{referenceName2} to {resultName1} or "
                + f"{referenceName2} to {synonymName1} or "
                + f"{referenceName1} to {resultName2} or "
                + f"{referenceName1} to {synonymName2}",
            )
            return True
    return False
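# Usage sketch (added for this listing, not part of the original module; it needs
# network access to webservice.thebiogrid.org and the access key embedded above):
# two gene symbols, a taxonomy annotation list, and the resolved "true" names go in,
# True/False comes back depending on whether BioGrid reports an interaction. The
# identifiers below are illustrative only.
#
#   hit = queryBioGridByName("EGF", "GRB2", ["9606"], "EGF", "GRB2")
#   print(hit)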
@memoize
def queryActiveSite(nameStr, organism):
    url = "http://www.uniprot.org/uniprot/?"
    response = None
    retry = 0
    while retry < 3:
        retry += 1
        if organism:
            organismExtract = list(organism)[0].split("/")[-1]
            # ASS - Updating the query to conform with a regular RESTful API request and work in Python3
            xparams = {
                "query": "{}+AND+organism:{}".format(nameStr, organismExtract),
                "columns": "name,id,feature(ACTIVE SITE)",
                "format": "tab",
                "limit": "5",
                "sort": "score",
            }
            # encode the query parameters for a POST request
            xparams = urllib.parse.urlencode(xparams).encode("utf-8")
            try:
                req = urllib.request.Request(url)
                with urllib.request.urlopen(req, data=xparams) as f:
                    response = f.read().decode("utf-8")
            except urllib.error.HTTPError:
                logMess(
                    "ERROR:MSC03", "A connection could not be established to uniprot"
                )
            response = str(response)
        if response in ["", None]:
            url = "http://www.uniprot.org/uniprot/?"
            # ASS - Updating the query to conform with a regular RESTful API request and work in Python3
            xparams = {
                "query": nameStr,
                "columns": "name,id,feature(ACTIVE SITE)",
                "format": "tab",
                "limit": "5",
                "sort": "score",
            }
            xparams = urllib.parse.urlencode(xparams).encode("utf-8")
            try:
                req = urllib.request.Request(url, data=xparams)
                with urllib.request.urlopen(req) as f:
                    response = f.read().decode("utf-8")
            except urllib.error.HTTPError:
                logMess(
                    "ERROR:MSC03", "A connection could not be established to uniprot"
                )
            response = str(response)
        if not response:
            return response
        parsedData = [x.split("\t") for x in response.split("\n")][1:]
        # return parsedData
        return [
            x[0]
            for x in parsedData
            if len(x) == 3
            and any(nameStr.lower() in z for z in [y.lower() for y in x[0].split("_")])
            and len(x[2]) > 0
        ]
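# Usage sketch (added for this listing, not part of the original module; requires
# network access to uniprot.org and assumes the legacy query/columns parameters above
# are still honoured by the service): returns the UniProt entry names matching nameStr
# that carry an annotated active site. A falsy organism skips the taxon restriction;
# the taxonomy URI below is illustrative.
#
#   print(queryActiveSite("EGF", ""))
#   print(queryActiveSite("EGF", ["http://identifiers.org/taxonomy/9606"]))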
@memoize
def name2uniprot(nameStr, organism):
    url = "http://www.uniprot.org/uniprot/?"
    response = None
    if organism:
        organismExtract = list(organism)[0].split("/")[-1]
        d = {
            "query": f"{nameStr}+AND+organism:{organismExtract}",
            "format": "tab",
            "limit": "5",
            "columns": "entry name,id",
            "sort": "score",
        }
        data = urllib.parse.urlencode(d).encode("utf-8")
        try:
            # decode the byte response before parsing
            response = urllib.request.urlopen(url, data=data).read().decode("utf-8")
        except urllib.error.HTTPError:
            logMess("ERROR:MSC03", "A connection could not be established to uniprot")
            return None
    if response in ["", None]:
        url = "http://www.uniprot.org/uniprot/?"
        d = {
            "query": f"{nameStr}",
            "format": "tab",
            "limit": "5",
            "columns": "entry name,id",
            "sort": "score",
        }
        data = urllib.parse.urlencode(d).encode("utf-8")
        try:
            response = urllib.request.urlopen(url, data=data).read().decode("utf-8")
        except urllib.error.HTTPError:
            return None
    parsedData = [x.split("\t") for x in response.split("\n")][1:]
    return [
        x[1]
        for x in parsedData
        if len(x) == 2
        and any(nameStr.lower() in z for z in [y.lower() for y in x[0].split("_")])
    ]
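# Usage sketch (added for this listing, not part of the original module; requires
# network access to uniprot.org): maps a protein name to candidate UniProt accession
# ids, trying the organism-restricted query first and the unrestricted query as a
# fallback. A falsy organism argument goes straight to the fallback query.
#
#   print(name2uniprot("MEKK1", ""))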
@memoize
def getReactomeBondByUniprot(uniprot1, uniprot2):
    """
    Queries reactome to see if two proteins referenced by their uniprot ids
    are bound in the same complex
    """
    url = "http://www.pathwaycommons.org/pc2/graph"
    d = {
        "kind": "PATHSFROMTO",
        "format": "EXTENDED_BINARY_SIF",
        "source": "|".join(uniprot1),
        "target": "|".join(uniprot2),
    }
    data = urllib.parse.urlencode(d).encode("utf-8")
    # query reactome
    try:
        response = urllib.request.urlopen(url, data=data).read().decode("utf-8")
    except urllib.error.HTTPError:
        # logMess('ERROR:pathwaycommons','A connection could not be established to pathwaycommons')
        return None
    # divide by line
    parsedResponse = [x.split("\t") for x in response.split("\n")]
    # the response is divided in two sections: actual protein-protein relationships
    # and protein descriptors
    separation = [i for i, x in enumerate(parsedResponse) if len(x) < 2]
    # separate the first half and focus on actual ppi entries
    ppi = [x for x in parsedResponse[: separation[0]] if x[1] == "in-complex-with"]
    # ppi = [x for x in parsedResponse[:separation[0]]]
    # get protein descriptors and filter by the initial uniprot id given in the method parameters
    includedElements = [[x[0], x[-1]] for x in parsedResponse[separation[0]:]]
    includedElements1 = [
        x for x in includedElements if any(y in x[1] for y in uniprot1)
    ]
    includedElements2 = [
        x for x in includedElements if any(y in x[1] for y in uniprot2)
    ]
    includedElements1 = [x[0] for x in includedElements1]
    includedElements2 = [x[0] for x in includedElements2]
    # filter protein interactions by those uniprot ids and names we truly care about
    ppi = [
        x[0:3]
        for x in ppi
        if (
            len([y for y in includedElements1 if y == x[0]]) == 1
            and len([y for y in includedElements2 if y == x[2]]) == 1
        )
        or (
            len([y for y in includedElements1 if y == x[2]]) == 1
            and len([y for y in includedElements2 if y == x[0]]) == 1
        )
    ]
    # ppi = [x[0:3] for x in ppi if len([y for y in includedElements1 if y in x]) == 1 and len([y for y in includedElements2 if y in x]) == 1]
    return ppi
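# Usage sketch (added for this listing, not part of the original module; requires
# network access to pathwaycommons.org): both arguments are lists of UniProt
# accessions (the "|".join above expects iterables of ids), and the returned list
# contains [source, 'in-complex-with', target] triplets, or None if the service could
# not be reached.
#
#   print(getReactomeBondByUniprot(["P07522"], ["P07522"]))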
@memoize
def getReactomeBondByName(name1, name2, sbmlURI, sbmlURI2, organism=None):
    """
    resolves the uniprot ids of parameters *name1* and *name2* and determines whether
    they can be bound in the same complex based on reactome information
    """
    if len(sbmlURI) > 0:
        uniprot1 = [x.split("/")[-1] for x in sbmlURI]
    else:
        uniprot1 = name2uniprot(name1, organism)
    if len(sbmlURI2) > 0:
        uniprot2 = [x.split("/")[-1] for x in sbmlURI2]
    else:
        uniprot2 = name2uniprot(name2, organism)
    uniprot1 = uniprot1 if uniprot1 else [name1]
    uniprot2 = uniprot2 if uniprot2 else [name2]
    result = getReactomeBondByUniprot(uniprot1, uniprot2)
    return result
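# Usage sketch (added for this listing, not part of the original module): when the
# SBML annotation URI lists are non-empty the UniProt ids are taken from their last
# path segments, otherwise name2uniprot is used; the ids are then handed to
# getReactomeBondByUniprot.
#
#   print(getReactomeBondByName("EGF", "Grb2", [], []))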
def isInComplexWith(name1, name2, sbmlURI=[], sbmlURI2=[], organism=None):
    nameset = sorted([name1, name2], key=lambda x: x[0])
    result = None
    retry = 0
    while retry < 3:
        result = getReactomeBondByName(
            nameset[0][0], nameset[1][0], nameset[0][1], nameset[1][1], organism
        )
        retry += 1
        if result:
            return any([x[1] == "in-complex-with" for x in result])
    return False
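# Usage sketch (added for this listing, not part of the original module): judging
# from the indexing above (nameset[0][0], nameset[0][1]), the call sites appear to
# pass (name, annotation URI list) pairs rather than bare strings. The UniProt URIs
# below are illustrative only.
#
#   inComplex = isInComplexWith(
#       ("EGF", ["http://identifiers.org/uniprot/P01133"]),
#       ("EGFR", ["http://identifiers.org/uniprot/P00533"]),
#   )
#
# This returns True only when pathway commons reports an 'in-complex-with' edge
# between the two names.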
if __name__ == "__main__":
    # pass
    # results = isInComplexWith('Crk','Ras')
    # print(getReactomeBondByName('EGF', 'Grb2', [], []))
    print((queryActiveSite("EGF", "")))
    # print(getReactomeBondByName('EGF', 'EGF', ['P07522'], ['P07522']))
    # print(name2uniprot('MEKK1'))
    # print(results)
    # print(getReactomeBondByUniprot('Q9QX70','Q9QX70'))
    # print(getReactomeBondByUniprot('P07522','P07522'))