Source code for vivarium.processes.translation

'''
======================
Stochastic Translation
======================
'''

import copy
import numpy as np
import random
import logging as log
from arrow import StochasticSystem

from vivarium.core.process import Process
from vivarium.core.experiment import pp
from vivarium.data.amino_acids import amino_acids
from vivarium.data.molecular_weight import molecular_weight
from vivarium.library.datum import Datum
from vivarium.library.polymerize import Elongation, Polymerase, Template, build_stoichiometry, all_products, generate_template
from vivarium.library.dict_utils import deep_merge

[docs]class Ribosome(Polymerase):
    pass

[docs]class Transcript(Template):
    pass

[docs]def shuffle(l):
    l = [item for item in l]
    np.random.shuffle(l)
    return l

[docs]def random_string(alphabet, length):
    string = ''
    for step in range(length):
        string += random.choice(alphabet)
    return string

#: Variable name for unbound ribosomes
UNBOUND_RIBOSOME_KEY = 'Ribosome'

monomer_symbols = []
monomer_ids = []

for symbol, id in amino_acids.items():
    monomer_symbols.append(symbol)
    monomer_ids.append(id)

A = random_string(monomer_symbols, 20)
Z = random_string(monomer_symbols, 60)
B = random_string(monomer_symbols, 30)
Y = random_string(monomer_symbols, 40)

[docs]def gather_genes(affinities):
    genes = {}
    for operon, product in affinities.keys():
        if not operon in genes:
            genes[operon] = []
        genes[operon].append(product)
    return genes

[docs]def transcripts_to_gene_counts(transcripts, operons):
    counts = {}
    for transcript, genes in operons.items():
        for gene in genes:
            counts[(transcript, gene)] = transcripts.get(transcript, 0)
    return counts

[docs]class Translation(Process):

    name = 'translation'
    defaults = {

        'sequences': {
            ('oA', 'eA'): A,
            ('oAZ', 'eA'): A,
            ('oAZ', 'eZ'): Z,
            ('oB', 'eB'): B,
            ('oBY', 'eB'): B,
            ('oBY', 'eY'): Y},

        'templates': {
            ('oA', 'eA'): generate_template(('oA', 'eA'), 20, ['eA']),
            ('oAZ', 'eA'): generate_template(('oAZ', 'eA'), 20, ['eA']),
            ('oAZ', 'eZ'): generate_template(('oAZ', 'eZ'), 60, ['eZ']),
            ('oB', 'eB'): generate_template(('oB', 'eB'), 30, ['eB']),
            ('oBY', 'eB'): generate_template(('oBY', 'eB'), 30, ['eB']),
            ('oBY', 'eY'): generate_template(('oBY', 'eY'), 40, ['eY'])},

        'transcript_affinities': {
            ('oA', 'eA'): 1.0,
            ('oAZ', 'eA'): 2.0,
            ('oAZ', 'eZ'): 5.0,
            ('oB', 'eB'): 1.0,
            ('oBY', 'eB'): 2.0,
            ('oBY', 'eY'): 5.0},

        'elongation_rate': 5.0,
        'polymerase_occlusion': 10,
        'symbol_to_monomer': amino_acids,
        'monomer_ids': monomer_ids,
        'concentration_keys': [],

        'mass_deriver_key': 'mass_deriver',
        'concentrations_deriver_key': 'translation_concentrations',
        'time_step': 1.0,
    }

    def __init__(self, initial_parameters=None):
        '''A stochastic translation model

        .. WARNING::
            Vivarium's knowledge base uses the gene name to name the
            protein. This means that for a gene acrA that codes for
            protein ArcA, you must refer to the gene, transcript, and
            protein each as acrA.

        .. DANGER::
            This documentation will need to be updated to reflect the
            changes in `#185
            <https://github.com/CovertLab/vivarium/pull/185>`_

        :term:`Ports`:

        * **ribosomes**: Expects the ``ribosomes`` variable, whose
          value is a list of the configurations of the ribosomes
          currently active.
        * **molecules**: Expects variables for each of the RNA
          nucleotides.
        * **transcripts**: Expects variables for each transcript to
          translate. Translation will read transcripts from this port.
        * **proteins**: Expects variables for each protein product. The
          produced proteins will be added to this port as counts.
        * **concentrations**: Expects variables for each key in
          ``concentration_keys``. This will be used by a :term:`deriver`
          to convert counts to concentrations.

        Arguments:
            initial_parameters: A dictionary of configuration options.
                Accepts the following keys:

                * **sequences** (:py:class:`dict`): Maps from operon
                  name to the RNA sequence of the operon, as a
                  :py:class:`str`.
                * **templates** (:py:class:`dict`): Maps from the name
                  of an transcript to a :term:`template specification`.
                  The template specification may be generated by
                  :py:func:`vivarium.library.polymerize.generate_template`
                  like so:

                  >>> from vivarium.library.polymerize import (
                  ...     generate_template)
                  >>> from vivarium.library.pretty import format_dict
                  >>> terminator_index = 5
                  >>> template = generate_template(
                  ...     'oA', terminator_index, ['product1'])
                  >>> print(format_dict(template))
                  {
                      "direction": 1,
                      "id": "oA",
                      "position": 0,
                      "sites": [],
                      "terminators": [
                          {
                              "position": 5,
                              "products": [
                                  "product1"
                              ],
                              "strength": 1.0
                          }
                      ]
                  }


                * **transcript_affinities** (:py:class:`dict`): A map
                  from the name of a transcript to the binding affinity
                  (a :py:class:`float`) of the ribosome for the
                  transcript.
                * **elongation_rate** (:py:class:`float`): The
                  elongation rate of the ribosome.

                  .. todo:: Units of elongation rate

                * **polymerase_occlusion** (:py:class:`int`): The number
                  of base pairs behind the polymerase where another
                  polymerase is occluded and so cannot bind.
                * **symbol_to_monomer** (:py:class:`dict`): Maps from
                  the symbols used to represent monomers in the RNA
                  sequence to the name of the free monomer. This should
                  generally be
                  :py:data:`vivarium.data.amino_acids.amino_acids`.
                * **monomer_ids** (:py:class:`list`): A list of the
                  names of the free monomers consumed by translation.
                  This can generally be computed as:

                  >>> import pprint
                  >>>
                  >>> from vivarium.data.amino_acids import amino_acids
                  >>> monomer_ids = amino_acids.values()
                  >>> pp = pprint.PrettyPrinter()
                  >>> pp.pprint(list(monomer_ids))
                  ['Alanine',
                   'Arginine',
                   'Asparagine',
                   'Aspartate',
                   'Cysteine',
                   'Glutamate',
                   'Glutamine',
                   'Glycine',
                   'Histidine',
                   'Isoleucine',
                   'Leucine',
                   'Lysine',
                   'Methionine',
                   'Phenylalanine',
                   'Proline',
                   'Serine',
                   'Threonine',
                   'Tryptophan',
                   'Tyrosine',
                   'Valine']

                  Note that we only included the `list()` transformation
                  to make the output prettier. The `dict_values` object
                  returned by `.values()` is sufficiently list-like for
                  use here. Also note that :py:mod:`pprint` just makes
                  the output prettier.
                * **concentration_keys** (:py:class:`list`): A list of
                  variables you want to be able to access as
                  concentrations from the *concentrations* port. The
                  actual conversion is handled by a deriver.

        Example configuring the process (uses
        :py:func:vivarium.library.pretty.format_dict):

        >>> from vivarium.library.pretty import format_dict
        >>> from vivarium.data.amino_acids import amino_acids
        >>> from vivarium.library.polymerize import generate_template
        >>> random.seed(0)  # Needed because process is stochastic
        >>> np.random.seed(0)
        >>> configurations = {
        ...     'sequences': {
        ...         ('oA', 'eA'): 'AWDPT',
        ...         ('oAZ', 'eZ'): 'YVEGELENGGMFISC',
        ...     },
        ...     'templates': {
        ...         ('oA', 'eA'): generate_template(('oA', 'eA'), 5, ['eA']),
        ...         ('oAZ', 'eZ'): generate_template(('oAZ', 'eZ'), 15, ['eA', 'eZ']),
        ...     },
        ...     'transcript_affinities': {
        ...         ('oA', 'eA'): 1.0,
        ...         ('oAZ', 'eZ'): 1.0,
        ...     },
        ...     'elongation_rate': 10.0,
        ...     'polymerase_occlusion': 10,
        ...     'symbol_to_monomer': amino_acids,
        ...     'monomer_ids': amino_acids.values(),
        ...     'concentration_keys': []
        ... }
        >>> # make the translation process, and initialize the states
        >>> translation = Translation(configurations)  # doctest:+ELLIPSIS
        >>> states = {
        ...     'ribosomes': {},
        ...     'molecules': {},
        ...     'proteins': {UNBOUND_RIBOSOME_KEY: 2},
        ...     'transcripts': {
        ...         'oA': 10,
        ...         'oAZ': 10,
        ...     }
        ... }
        >>> states['molecules'].update(
        ...     {
        ...         molecule_id: 100
        ...         for molecule_id in translation.monomer_ids
        ...     }
        ... )
        >>> update = translation.next_update(1, states)
        >>> print(update['ribosomes'])
        {'_add': [{'path': (1,), 'state': <class 'vivarium.processes.translation.Ribosome'>: {'id': 1, 'state': 'occluding', 'position': 9, 'template': ('oAZ', 'eZ'), 'template_index': 0, 'terminator': 0}}, {'path': (2,), 'state': <class 'vivarium.processes.translation.Ribosome'>: {'id': 2, 'state': 'occluding', 'position': 9, 'template': ('oAZ', 'eZ'), 'template_index': 0, 'terminator': 0}}], '_delete': []}
        '''

        if not initial_parameters:
            initial_parameters = {}

        self.monomer_symbols = list(amino_acids.keys())
        self.monomer_ids = list(amino_acids.values())

        self.default_parameters = copy.deepcopy(self.defaults)

        templates = self.or_default(initial_parameters, 'templates')

        self.default_parameters['protein_ids'] = all_products({
            key: Template(config)
            for key, config in templates.items()})

        self.default_parameters['transcript_order'] = list(
            initial_parameters.get(
                'transcript_affinities',
                self.default_parameters['transcript_affinities']).keys())
        self.default_parameters['molecule_ids'] = self.monomer_ids

        self.parameters = copy.deepcopy(self.default_parameters)
        self.parameters.update(initial_parameters)

        self.sequences = self.parameters['sequences']
        self.templates = self.parameters['templates']

        self.transcript_affinities = self.parameters['transcript_affinities']
        self.operons = gather_genes(self.transcript_affinities)
        self.operon_order = list(self.operons.keys())
        self.transcript_order = self.parameters['transcript_order']
        self.transcript_count = len(self.transcript_order)

        self.monomer_ids = self.parameters['monomer_ids']
        self.molecule_ids = self.parameters['molecule_ids']
        self.molecule_ids.extend(['ATP', 'ADP'])

        self.protein_ids = self.parameters['protein_ids']
        self.symbol_to_monomer = self.parameters['symbol_to_monomer']
        self.elongation = 0
        self.elongation_rate = self.parameters['elongation_rate']
        self.polymerase_occlusion = self.parameters['polymerase_occlusion']
        self.concentration_keys = self.parameters['concentration_keys']

        self.affinity_vector = np.array([
            self.transcript_affinities[transcript_key]
            for transcript_key in self.transcript_order], dtype=np.float64)

        self.stoichiometry = build_stoichiometry(self.transcript_count)

        self.initiation = StochasticSystem(self.stoichiometry)

        self.ribosome_id = 0

        self.protein_keys = self.concentration_keys + self.protein_ids
        self.all_protein_keys = self.protein_keys + [UNBOUND_RIBOSOME_KEY]

        self.mass_deriver_key = self.or_default(initial_parameters, 'mass_deriver_key')
        self.concentrations_deriver_key = self.or_default(
            initial_parameters, 'concentrations_deriver_key')

        log.info('translation parameters: {}'.format(self.parameters))

        super(Translation, self).__init__(self.parameters)

[docs]    def ports_schema(self):

        def add_mass(schema, masses, key):
            if '_properties' not in schema:
                schema['_properties'] = {}
            if key in masses:
                schema['_properties']['mw'] = masses[key]
            return schema

        return {
            'ribosomes': {
                '*': {
                    'id': {
                        '_default': -1,
                        '_updater': 'set'},
                    'domain': {
                        '_default': 0,
                        '_updater': 'set'},
                    'state': {
                        '_default': None,
                        '_updater': 'set',
                        '_emit': True},
                    'position': {
                        '_default': 0,
                        '_updater': 'set',
                        '_emit': True},
                    'template': {
                        '_default': None,
                        '_updater': 'set',
                        '_emit': True},
                    'template_index': {
                        '_default': 0,
                        '_updater': 'set',
                        '_emit': True}}},

            'global': {},

            'molecules': {
                molecule: add_mass({
                    '_emit': True,
                    '_default': 0,
                    '_divider': 'split'}, molecular_weight, molecule)
                for molecule in self.molecule_ids},

            'transcripts': {
                transcript: add_mass({
                    '_default': 0,
                    '_divider': 'split'}, molecular_weight, transcript)
                for transcript in list(self.operons.keys())},

            'proteins': {
                protein: add_mass({
                    '_default': 0,
                    '_divider': 'split',
                    '_emit': True}, molecular_weight, protein)
                for protein in self.all_protein_keys},

            'concentrations': {
                molecule: {
                    '_default': 0.0,
                    '_updater': 'set'}
                for molecule in self.protein_keys}}

[docs]    def derivers(self):
        return {
            self.mass_deriver_key: {
                'deriver': 'mass_deriver',
                'port_mapping': {
                    'global': 'global'}},
            self.concentrations_deriver_key: {
                'deriver': 'concentrations_deriver',
                'port_mapping': {
                    'global': 'global',
                    'counts': 'proteins',
                    'concentrations': 'concentrations'},
                'config': {
                    'concentration_keys': self.protein_keys}}}

[docs]    def next_update(self, timestep, states):
        molecules = states['molecules']
        transcripts = states['transcripts']
        proteins = states['proteins']

        ribosomes = {
            id: Ribosome(ribosome)
            for id, ribosome in states['ribosomes'].items()}

        original_ribosome_keys = ribosomes.keys()

        gene_counts = np.array(
            list(transcripts_to_gene_counts(transcripts, self.operons).values()),
            dtype=np.int64)

        # Find out how many transcripts are currently blocked by a
        # newly initiated ribosome
        bound_transcripts = np.zeros(self.transcript_count, dtype=np.int64)
        ribosomes_by_transcript = {
            transcript_key: []
            for transcript_key in self.transcript_order}
        for ribosome in ribosomes.values():
            ribosomes_by_transcript[ribosome.template].append(ribosome)
        for index, transcript in enumerate(self.transcript_order):
            bound_transcripts[index] = len([
                ribosome
                for ribosome in ribosomes_by_transcript[transcript]
                if ribosome.is_bound()])

        # Make the state for a gillespie simulation out of total number of each
        # transcript not blocked by a bound ribosome, concatenated with the number
        # of each transcript that is bound by a ribosome.
        # These are the two states for each transcript the simulation
        # will operate on, essentially going back and forth between
        # bound and unbound states.

        original_unbound_ribosomes = proteins[UNBOUND_RIBOSOME_KEY]
        monomer_limits = {
            monomer: molecules[monomer]
            for monomer in self.monomer_ids}
        unbound_ribosomes = original_unbound_ribosomes

        templates = {
            key: Template(template)
            for key, template in self.templates.items()}

        time = 0
        now = 0
        elongation = Elongation(
            self.sequences,
            templates,
            monomer_limits,
            self.symbol_to_monomer,
            self.elongation)

        while time < timestep:
            # build the state vector for the gillespie simulation
            substrate = np.concatenate([
                gene_counts - bound_transcripts,
                bound_transcripts,
                [unbound_ribosomes]])

            # find number of monomers until next terminator
            distance = 1 / self.elongation_rate

            # find interval of time that elongates to the point of the next terminator
            interval = min(distance, timestep - time)

            if interval == distance:
                # perform the elongation until the next event
                terminations, monomer_limits, ribosomes = elongation.step(
                    interval,
                    monomer_limits,
                    ribosomes)
                unbound_ribosomes += terminations
            else:
                elongation.store_partial(interval)
                terminations = 0

            # run simulation for interval of time to next terminator
            result = self.initiation.evolve(
                interval,
                substrate,
                self.affinity_vector)

            # go through each event in the simulation and update the state
            ribosome_bindings = 0
            for now, event in zip(result['time'], result['events']):
                # ribosome has bound the transcript
                transcript_key = self.transcript_order[event]
                bound_transcripts[event] += 1

                self.ribosome_id += 1
                new_ribosome = Ribosome({
                    'id': self.ribosome_id,
                    'template': transcript_key,
                    'position': 0})
                new_ribosome.bind()
                new_ribosome.start_polymerizing()
                ribosomes[new_ribosome.id] = new_ribosome

                ribosome_bindings += 1
                unbound_ribosomes -= 1

            # deal with occluding rnap
            for ribosome in ribosomes.values():
                if ribosome.is_unoccluding(self.polymerase_occlusion):
                    bound_transcripts[ribosome.template_index] -= 1
                    ribosome.unocclude()

            time += interval

        # track how far elongation proceeded to start from next iteration
        self.elongation = elongation.elongation - int(elongation.elongation)

        proteins = {
            UNBOUND_RIBOSOME_KEY: unbound_ribosomes - original_unbound_ribosomes}
        proteins.update(elongation.complete_polymers)

        molecules = {
            key: count * -1
            for key, count in elongation.monomers.items()}

        original = set(original_ribosome_keys)
        current = set(ribosomes.keys())
        bound_ribosomes = current - original
        completed_ribosomes = original - current
        continuing_ribosomes = original - completed_ribosomes

        # ATP hydrolysis cost is 2 per amino acid elongation
        molecules['ATP'] = 0
        molecules['ADP'] = 0
        for count in elongation.monomers.values():
            molecules['ATP'] -= 2 * count
            molecules['ADP'] += 2 * count

        ribosome_updates = {
            id: ribosomes[id]
            for id in continuing_ribosomes}

        add_ribosomes = [
            {'path': (bound,), 'state': ribosomes[bound]}
            for bound in bound_ribosomes]

        delete_ribosomes = [
            (completed,)
            for completed in completed_ribosomes]

        ribosome_updates['_add'] = add_ribosomes
        ribosome_updates['_delete'] = delete_ribosomes

        update = {
            'ribosomes': ribosome_updates,
            'molecules': molecules,
            'proteins': proteins}

        return update


[docs]def test_translation():
    parameters = {}
    translation = Translation(parameters)

    states = {
        'ribosomes': {},
        'molecules': {'ATP': 100000},
        'proteins': {UNBOUND_RIBOSOME_KEY: 10},
        'transcripts': {
            'oA': 10,
            'oAZ': 10,
            'oB': 10,
            'oBY': 10}}
    states['molecules'].update({
        molecule_id: 100
        for molecule_id in translation.monomer_ids})

    update = translation.next_update(10.0, states)

    pp(update)
    print('complete!')



if __name__ == '__main__':
    test_translation()