# Source code for pytaxize.ncbi.ncbi

import os
import requests
import datetime
from lxml import etree
import re
import json
import pkg_resources
from pytaxize.refactor import Refactor
from pytaxize.utils import *


def search(sci_com, modifier=None, rank_query=None):
    """
    Search NCBI's taxonomic data - get NCBI taxonomy summary records

    For every name an ``esearch`` request is sent to the ``taxonomy``
    database; the ids it returns are then passed to ``esummary`` and each
    ``DocSum`` element of that response is turned into a dict.

    :param sci_com: a common or scientific name, or a list of them
    :param modifier: A modifier to the `sci_com` given. Options include:
        Organism, Scientific Name, Common Name, All Names, Division,
        Filter, Lineage, GC, MGC, Name Tokens, Next Level, PGC, Properties,
        Rank, Subtree, Synonym, Text Word. These are not checked, so make
        sure they are entered correctly, as is.
    :param rank_query: A taxonomic rank name to modify the query sent to NCBI.
        Though note that some data sources use atypical ranks, so inspect the
        data itself for options. Optional.

    :note: Remember to set your Entrez API key as `ENTREZ_KEY`

    :raises Exception: if the ``ENTREZ_KEY`` environment variable is not set

    :return: dict, named with values given to `sci_com`,
        where each value in the dict is a list with one dict per
        ``DocSum`` record returned by ``esummary``. The list is empty
        when ``esearch`` finds no ids for that name.

    Usage::

        from pytaxize import ncbi

        ncbi.search(sci_com = "Apis")

        # Many names
        ncbi.search(sci_com=["Apis", "Puma concolor", "Pinus"])

        # Example with more than 1 result
        ncbi.search(sci_com='Satyrium')
        ncbi.search(sci_com=['Satyrium', 'Pinus'])

        # common names
        ncbi.search(sci_com = 'bear')
    """

    key = os.environ.get("ENTREZ_KEY")
    if key is None:
        raise Exception("ENTREZ_KEY is not defined")

    def _summaries(name):
        # Build the query term: spaces become "+", then the optional field
        # modifier and rank restriction are appended.
        term = name.replace(" ", "+")
        if modifier is not None:
            term = term + "[%s]" % modifier
        if rank_query is not None:
            term = term + " AND %s[Rank]" % rank_query

        found = _entrez(
            "esearch", {"db": "taxonomy", "term": term, "api_key": key}
        )
        ids = [int(node.text) for node in found.xpath("//IdList/Id")]
        if not ids:
            # No match: skip esummary, which would otherwise be called with
            # an empty id list.
            return []

        # Always send a single comma-separated string, whether one id or
        # many were found.
        id_param = ",".join(map(str, ids))
        summary = _entrez(
            "esummary", {"db": "taxonomy", "ID": id_param, "api_key": key}
        )

        records = []
        for docsum in summary.xpath("//DocSum"):
            # The first child of each DocSum is skipped (presumably the <Id>
            # element - confirm against an esummary response). Each remaining
            # child contributes its first attribute value as the key and its
            # text as the value.
            records.append({item.values()[0]: item.text for item in docsum[1:]})
        return records

    names = str2list(sci_com)
    return lists2dict([_summaries(name) for name in names], names)


def hierarchy(ids):
    """
    Fetch complete taxonomic lineages from NCBI

    A single ``efetch`` request is made against the ``taxonomy`` database
    for all of the given ids.

    :param ids: one NCBI taxonomy id, or a list of them; any value that is
        not a list is wrapped in a one-element list

    :note: Remember to set your Entrez API key as `ENTREZ_KEY`

    :raises Exception: if the ``ENTREZ_KEY`` environment variable is not set

    :return: dict whose keys are the supplied ids, paired positionally with
        the ``Taxon`` elements of the response. Each value is a list of
        dicts with the fields ``ScientificName``, ``Rank``, and ``TaxId``:
        one per ``LineageEx/Taxon`` entry, followed by one for the taxon
        itself

    Usage::

        from pytaxize import ncbi
        ncbi.hierarchy(ids=9606)
        ncbi.hierarchy(ids=[9606,55062,4231])
    """
    api_key = os.environ.get("ENTREZ_KEY")
    if api_key is None:
        raise Exception("ENTREZ_KEY is not defined")

    fields = ("ScientificName", "Rank", "TaxId")

    def _record(element):
        # Text of the first matching child for each field, in `fields` order.
        return {field: element.xpath(field)[0].text for field in fields}

    if not isinstance(ids, list):
        ids = [ids]

    query = {
        "db": "taxonomy",
        "ID": ",".join(map(str, ids)),
        "api_key": api_key,
    }
    response = _entrez("efetch", query)

    lineages = []
    for taxon in response.xpath("//TaxaSet/Taxon"):
        # Lineage entries first, then the requested taxon as the last item.
        lineage = [_record(step) for step in taxon.xpath(".//LineageEx/Taxon")]
        lineage.append(_record(taxon))
        lineages.append(lineage)

    # zip pairs ids and lineages by position and stops at the shorter one.
    return dict(zip(ids, lineages))


def _entrez(path="esearch", args=None):
    """
    Call an NCBI E-utilities endpoint and return the result of
    ``Refactor(...).xml()``

    :param path: name of the E-utilities program; it is inserted into the
        URL as ``<path>.fcgi`` (this module uses ``esearch``, ``esummary``
        and ``efetch``)
    :param args: dict of request parameters handed to ``Refactor``; an
        empty dict is used when omitted

    :return: the value returned by ``Refactor.xml()``; callers in this
        module query it with ``.xpath()``
    """
    # A fresh dict per call avoids sharing one mutable default object
    # between calls.
    if args is None:
        args = {}
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/%s.fcgi" % path
    return Refactor(url, args, request="get").xml()


if __name__ == "__main__":
    # When executed as a script, run the doctest examples found in this
    # module's docstrings.
    from doctest import testmod

    testmod()