Source code for vivarium.processes.transcription

'''
========================
Stochastic Transcription
========================
'''

import os
import copy
import numpy as np
import logging as log
from arrow import StochasticSystem

from vivarium.library.units import units
from vivarium.library.dict_utils import deep_merge, keys_list
from vivarium.core.experiment import pp, pf
from vivarium.core.process import Process
from vivarium.core.composition import process_in_experiment
from vivarium.states.chromosome import Chromosome, Rnap, Promoter, frequencies, add_merge, toy_chromosome_config
from vivarium.library.polymerize import Elongation, build_stoichiometry, template_products
from vivarium.data.nucleotides import nucleotides

[docs]def choose_element(elements): if elements: choice = np.random.choice(len(elements), 1) return list(elements)[int(choice)]
#: Variable name for unbound RNA polymerase UNBOUND_RNAP_KEY = 'RNA Polymerase' monomer_ids = list(nucleotides.values()) #: The default configuration parameters for :py:class:`Transcription`
[docs]class Transcription(Process): name = 'transcription' defaults = { 'promoter_affinities': {}, 'transcription_factors': [], 'sequence': '', 'templates': {}, 'genes': {}, 'elongation_rate': 1.0, 'polymerase_occlusion': 5, 'symbol_to_monomer': nucleotides, 'monomer_ids': monomer_ids, 'concentrations_deriver_key': 'transcription_concentrations', 'initial_domains': { 0: { 'id': 0, 'lead': 0, 'lag': 0, 'children': []}}, 'molecule_ids': monomer_ids, 'time_step': 1.0, } def __init__(self, initial_parameters=None): '''A stochastic transcription model .. WARNING:: Vivarium's knowledge base uses the gene name to name the protein. This means that for a gene acrA that codes for protein AcrA, you must refer to the gene, transcript, and protein each as acrA. :term:`Ports`: * **chromosome**: The linked :term:`store` should hold a representation of the chromosome in the form returned by :py:meth:`vivarium.states.chromosome.Chromosome.to_dict`. * **molecules**: Expects variables with the names in the *molecule_ids* configuration. These are the monomers consumed by transcription. * **factors**: Expects variables for each transcription factor's concentration. * **transcripts**: The linked store should store the concentrations of the transcripts. * **proteins**: The linked store should hold the concentrations of the transcription factors and the RNA polymerase. Arguments: initial_parameters: The following configuration options may be provided: * **promoter_affinities** (:py:class:`dict`): Maps from binding state tuples to the binding affinity of RNA polymerase and the promoter when the promoter is at that binding state. The binding state of a promoter is which (if any) transcription factors are bound to the promoter. Such a binding state can be represented by a binding state tuple, which is a :py:class:`tuple` whose first element is the name of the promoter. All bound transcription factors are listed as subsequent elements. If no transcription factors are bound, the sole subsequent element is ``None``. .. todo:: What is the significance of the order in the binding state tuple? .. todo:: What are the units of the affinities? * **transcription_factors** (:py:class:`list`): A list of all modeled transcription factors. * **sequence**: The DNA sequence that includes all the genes whose transcription is being modeled. * **templates** (:py:class:`dict`): Maps from the name of an operon to that operon's :term:`template specification`. * **genes** (:py:class:`dict`): Maps from operon name to a list of the names of the genes in that operon. * **elongation_rate** (:py:class:`float`): The elongation rate of the RNA polymerase. * **polymerase_occlusion** (:py:class:`int`): The number of base pairs behind the polymerase where another polymerase is occluded and so cannot bind. * **symbol_to_monomer** (:py:class:`dict`): Maps from the symbols used to represent monomers in the RNA sequence to the name of the free monomer. This should generally be :py:data:`vivarium.data.nucleotides.nucleotides`. * **monomer_ids** (:py:class:`list`): A list of the names of the free monomers consumed by transcription. This can generally be computed as: >>> from vivarium.data.nucleotides import nucleotides >>> monomer_ids = nucleotides.values() >>> print(list(monomer_ids)) ['ATP', 'GTP', 'UTP', 'CTP'] Note that we only included the ``list()`` transformation to make the output prettier. The ``dict_values`` object returned by the ``.values()`` call is sufficiently list-like for use here. * **molecule_ids** (:py:class:`list`): A list of all the molecules needed by the :term:`process`. This will generally be the same as *monomer_ids*. Example configuring the process (uses :py:func:`vivarium.library.pretty.format_dict`): >>> import random >>> >>> import numpy as np >>> >>> from vivarium.states.chromosome import ( ... toy_chromosome_config, ... Chromosome, ... ) >>> from vivarium.data.nucleotides import nucleotides >>> # format_dict lets us print dictionaries prettily >>> from vivarium.library.pretty import format_dict >>> >>> random.seed(0) # Needed because process is stochastic >>> np.random.seed(0) >>> # We will use the toy chromosome from toy_chromosome_config >>> print(toy_chromosome_config) {'sequence': 'ATACGGCACGTGACCGTCAACTTA', 'genes': {'oA': ['eA'], 'oAZ': ['eA', 'eZ'], 'oB': ['eB'], 'oBY': ['eB', 'eY']}, 'promoter_order': ['pA', 'pB'], 'promoters': {'pA': {'id': 'pA', 'position': 3, 'direction': 1, 'sites': [{'position': 0, 'length': 3, 'thresholds': {'tfA': <Quantity(0.3, 'millimolar')>}}], 'terminators': [{'position': 6, 'strength': 0.5, 'products': ['oA']}, {'position': 12, 'strength': 1.0, 'products': ['oAZ']}]}, 'pB': {'id': 'pB', 'position': -3, 'direction': -1, 'sites': [{'position': 0, 'length': 3, 'thresholds': {'tfB': <Quantity(0.5, 'millimolar')>}}], 'terminators': [{'position': -9, 'strength': 0.5, 'products': ['oB']}, {'position': -12, 'strength': 1.0, 'products': ['oBY']}]}}, 'promoter_affinities': {('pA', None): 1.0, ('pA', 'tfA'): 10.0, ('pB', None): 1.0, ('pB', 'tfB'): 10.0}, 'domains': {0: {'id': 0, 'lead': 0, 'lag': 0, 'children': []}}, 'rnaps': {}} >>> monomer_ids = list(nucleotides.values()) >>> configuration = { ... 'promoter_affinities': { ... ('pA', None): 1.0, ... ('pA', 'tfA'): 10.0, ... ('pB', None): 1.0, ... ('pB', 'tfB'): 10.0}, ... 'transcription_factors': ['tfA', 'tfB'], ... 'sequence': toy_chromosome_config['sequence'], ... 'templates': toy_chromosome_config['promoters'], ... 'genes': toy_chromosome_config['genes'], ... 'elongation_rate': 10.0, ... 'polymerase_occlusion': 5, ... 'symbol_to_monomer': nucleotides, ... 'monomer_ids': monomer_ids, ... 'molecule_ids': monomer_ids, ... } >>> # At this point we haven't used the toy chromosome yet >>> # because it will be specified in the chromosome port. >>> # Notice that the parameters are specific to the chromosome. >>> transcription_process = Transcription(configuration) >>> # Now we need to initialize the simulation stores >>> state = { ... 'chromosome': toy_chromosome_config, ... 'molecules': { ... nucleotide: 10 ... for nucleotide in monomer_ids ... }, ... 'proteins': {UNBOUND_RNAP_KEY: 10}, ... 'factors': {'tfA': 0.2 * units.mM, 'tfB': 0.7 * units.mM}, ... } >>> update = transcription_process.next_update(1.0, state) >>> print(update['chromosome']) {'rnaps': {'_add': [{'path': (2,), 'state': <class 'vivarium.states.chromosome.Rnap'>: {'id': 2, 'template': 'pA', 'template_index': 0, 'terminator': 1, 'domain': 0, 'state': 'polymerizing', 'position': 7}}, {'path': (3,), 'state': <class 'vivarium.states.chromosome.Rnap'>: {'id': 3, 'template': 'pB', 'template_index': 1, 'terminator': 0, 'domain': 0, 'state': 'occluding', 'position': 3}}, {'path': (4,), 'state': <class 'vivarium.states.chromosome.Rnap'>: {'id': 4, 'template': 'pA', 'template_index': 0, 'terminator': 0, 'domain': 0, 'state': 'occluding', 'position': 0}}], '_delete': []}, 'rnap_id': 4, 'domains': {0: <class 'vivarium.states.chromosome.Domain'>: {'id': 0, 'lead': 0, 'lag': 0, 'children': []}}, 'root_domain': 0} ''' if not initial_parameters: initial_parameters = {} log.debug('inital transcription parameters: {}'.format(initial_parameters)) super(Transcription, self).__init__(initial_parameters) self.derive_defaults('templates', 'promoter_order', keys_list) self.derive_defaults('templates', 'transcript_ids', template_products) self.sequence = self.parameters['sequence'] self.templates = self.parameters['templates'] self.genes = self.parameters['genes'] empty_chromosome = Chromosome({ 'sequence': self.sequence, 'promoters': self.templates, 'genes': self.genes}) self.sequences = empty_chromosome.sequences() self.symbol_to_monomer = self.parameters['symbol_to_monomer'] log.debug('chromosome sequence: {}'.format(self.sequence)) self.promoter_affinities = self.parameters['promoter_affinities'] self.promoter_order = self.parameters['promoter_order'] self.promoter_count = len(self.promoter_order) self.transcription_factors = self.parameters['transcription_factors'] self.molecule_ids = self.parameters['molecule_ids'] self.molecule_ids.extend(['ATP', 'ADP']) self.monomer_ids = self.parameters['monomer_ids'] self.transcript_ids = self.parameters['transcript_ids'] self.elongation = 0 self.elongation_rate = self.parameters['elongation_rate'] self.polymerase_occlusion = self.parameters['polymerase_occlusion'] self.stoichiometry = build_stoichiometry(self.promoter_count) self.initiation = StochasticSystem(self.stoichiometry, random_seed=np.random.randint(2**31)) self.protein_ids = [UNBOUND_RNAP_KEY] + self.transcription_factors self.initial_domains = self.parameters['initial_domains'] self.concentrations_deriver_key = self.parameters['concentrations_deriver_key'] self.chromosome_ports = ['rnaps', 'rnap_id', 'domains', 'root_domain'] log.debug('final transcription parameters: {}'.format(self.parameters))
[docs] def build_affinity_vector(self, promoters, factors): vector = np.zeros(len(self.promoter_order), dtype=np.float64) for index, promoter_key in enumerate(self.promoter_order): promoter = promoters[promoter_key] binding = promoter.binding_state(factors) affinity = self.promoter_affinities.get(binding, 0.0) # print('promoter state - {}: {}'.format(binding, affinity)) vector[index] = affinity return vector
[docs] def chromosome_config(self, chromosome_states): return dict( chromosome_states, sequence=self.sequence, promoters=self.templates, promoter_order=self.promoter_order, genes=self.genes)
[docs] def ports_schema(self): schema = {} schema['chromosome'] = { 'rnap_id': { '_default': 1, '_updater': 'set'}, 'root_domain': { '_default': 0, '_updater': 'set'}, 'domains': { '*': { 'id': { '_default': 1, '_updater': 'set'}, 'lead': { '_default': 0, '_updater': 'set'}, 'lag': { '_default': 0, '_updater': 'set'}, 'children': { '_default': [], '_updater': 'set'}}}, 'rnaps': { '*': { 'id': { '_default': -1, '_updater': 'set'}, 'domain': { '_default': 0, '_updater': 'set'}, 'state': { '_default': None, '_updater': 'set', '_emit': True}, 'position': { '_default': 0, '_updater': 'set', '_emit': True}, 'template': { '_default': None, '_updater': 'set', '_emit': True}, 'template_index': { '_default': 0, '_updater': 'set', '_emit': True}, 'terminator': { '_default': 0, '_updater': 'set', '_emit': True}}}} initial_domains = { id: { 'id': { '_default': id, '_updater': 'set'}, 'lead': { '_default': 0, '_updater': 'set'}, 'lag': { '_default': 0, '_updater': 'set'}, 'children': { '_default': [], '_updater': 'set'}} for id, domain in self.initial_domains.items()} schema['chromosome']['domains'].update(initial_domains) schema['molecules'] = { molecule: { '_default': 0, '_divider': 'split', '_emit': True} for molecule in self.molecule_ids} schema['factors'] = { factor: { '_default': 0.0, '_divider': 'split'} for factor in self.transcription_factors} schema['transcripts'] = { protein: { '_default': 0, '_divider': 'split', '_emit': True} for protein in self.transcript_ids} schema['proteins'] = { protein: { '_default': 0, '_divider': 'split', '_emit': True} for protein in self.protein_ids} schema['global'] = {} return schema
[docs] def derivers(self): return { self.concentrations_deriver_key: { 'deriver': 'concentrations_deriver', 'port_mapping': { 'global': 'global', 'counts': 'proteins', 'concentrations': 'factors'}, 'config': { 'concentration_keys': self.transcription_factors}}}
[docs] def next_update(self, timestep, states): chromosome_state = states['chromosome'] # chromosome_state['rnaps'] = list(chromosome_state['rnaps'].values()) original_rnap_keys = [ rnap['id'] for rnap in chromosome_state['rnaps'].values()] chromosome = Chromosome( self.chromosome_config(chromosome_state)) molecules = states['molecules'] proteins = states['proteins'] factors = states['factors'] # as concentrations promoter_rnaps = chromosome.promoter_rnaps() promoter_domains = chromosome.promoter_domains() # Find out how many promoters are currently blocked by a # newly initiated or occluding rnap promoter_count = len(chromosome.promoter_order) blocked_promoters = np.zeros(promoter_count, dtype=np.int64) open_domains = {} bound_domains = {} for promoter_index, promoter_key in enumerate(chromosome.promoter_order): domains = [] for rnap in promoter_rnaps.get(promoter_key, {}).values(): if rnap.is_occluding(): domains.append(rnap.domain) blocked_promoters[promoter_index] += 1 bound_domains[promoter_key] = set(domains) open_domains[promoter_key] = promoter_domains[promoter_key] - bound_domains[promoter_key] blocked_promoters = np.array(blocked_promoters) # Make the state for a gillespie simulation out of total number of each # promoter by copy number not blocked by initiated rnap, # concatenated with the number of each promoter that is bound by rnap. # These are the two states for each promoter the simulation # will operate on, essentially going back and forth between # bound and unbound states. copy_numbers = chromosome.promoter_copy_numbers() original_unbound_rnaps = proteins[UNBOUND_RNAP_KEY] monomer_limits = { monomer: molecules[monomer] for monomer in self.monomer_ids} unbound_rnaps = original_unbound_rnaps time = 0 now = 0 elongation = Elongation( self.sequences, chromosome.promoters, monomer_limits, self.symbol_to_monomer, self.elongation) initiation_affinity = self.build_affinity_vector(chromosome.promoters, factors) while time < timestep: # build the state vector for the gillespie simulation substrate = np.concatenate([ copy_numbers - blocked_promoters, blocked_promoters, [unbound_rnaps]]) log.debug('transcription substrate: {}'.format(substrate)) log.debug('blocked promoters: {}'.format(blocked_promoters)) # find number of monomers until next terminator distance = 1 / self.elongation_rate # chromosome.terminator_distance() # find interval of time that elongates to the point of the next terminator interval = min(distance, timestep - time) if interval == distance: # perform the elongation until the next event terminations, monomer_limits, chromosome.rnaps = elongation.step( interval, monomer_limits, chromosome.rnaps) unbound_rnaps += terminations else: elongation.store_partial(interval) terminations = 0 log.debug('time: {} --- interval: {}'.format(time, interval)) log.debug('monomer limits: {}'.format(monomer_limits)) log.debug('terminations: {}'.format(terminations)) # run simulation for interval of time to next terminator result = self.initiation.evolve( interval, substrate, initiation_affinity) log.debug('result: {}'.format(result)) # perform binding for now, event in zip(result['time'], result['events']): # RNAP has bound the promoter promoter_key = chromosome.promoter_order[event] promoter = chromosome.promoters[promoter_key] domains = open_domains[promoter_key] domain = choose_element(domains) blocked_promoters[event] += 1 bound_domains[promoter_key].add(domain) open_domains[promoter_key].remove(domain) # create a new bound RNAP and add it to the chromosome. new_rnap = chromosome.bind_rnap(event, domain) new_rnap.start_polymerizing() log.debug('newly bound RNAP: {}'.format(new_rnap)) unbound_rnaps -= 1 # deal with occluding rnap for rnap in chromosome.rnaps.values(): if rnap.is_unoccluding(self.polymerase_occlusion): log.debug('RNAP unoccluding: {}'.format(rnap)) blocked_promoters[rnap.template_index] -= 1 bound_domains[rnap.template].remove(rnap.domain) open_domains[rnap.template].add(rnap.domain) rnap.unocclude() log.debug('rnap: {}'.format(rnap)) log.debug('complete: {}'.format(elongation.complete_polymers)) time += interval # track how far elongation proceeded to start from next iteration self.elongation = elongation.elongation - int(elongation.elongation) proteins = { UNBOUND_RNAP_KEY: unbound_rnaps - original_unbound_rnaps} molecules = { key: count * -1 for key, count in elongation.monomers.items()} # 1 ATP hydrolysis cost per nucleotide elongation molecules['ATP'] = 0 molecules['ADP'] = 0 for count in elongation.monomers.values(): molecules['ATP'] -= count molecules['ADP'] += count chromosome_dict = chromosome.to_dict() rnaps = chromosome_dict['rnaps'] original = set(original_rnap_keys) current = set(rnaps.keys()) bound_rnaps = current - original completed_rnaps = original - current continuing_rnaps = original - completed_rnaps rnap_updates = { rnap_id: rnaps[rnap_id] for rnap_id in continuing_rnaps} add_rnaps = [ {'path': (bound,), 'state': rnaps[bound]} for bound in bound_rnaps] delete_rnaps = [ (completed,) for completed in completed_rnaps] rnap_updates['_add'] = add_rnaps rnap_updates['_delete'] = delete_rnaps chromosome_dict['rnaps'] = rnap_updates update = { 'chromosome': { key: chromosome_dict[key] for key in self.chromosome_ports}, 'proteins': proteins, 'molecules': molecules, 'transcripts': elongation.complete_polymers} log.debug('molecules update: {}'.format(update['molecules'])) return update
[docs]def test_transcription(): parameters = { 'sequence': toy_chromosome_config['sequence'], 'templates': toy_chromosome_config['promoters'], 'genes': toy_chromosome_config['genes'], 'promoter_affinities': toy_chromosome_config['promoter_affinities'], 'transcription_factors': ['tfA', 'tfB'], 'elongation_rate': 10.0} chromosome = Chromosome(toy_chromosome_config) transcription = Transcription(parameters) initial_molecules = { nucleotide: 10 for nucleotide in transcription.monomer_ids} initial_molecules['ATP'] = 100000 experiment = process_in_experiment(transcription, { 'initial_state': { 'chromosome': chromosome.to_dict(), 'molecules': initial_molecules, 'proteins': {UNBOUND_RNAP_KEY: 10}, 'factors': {'tfA': 0.2, 'tfB': 0.7}}}) pp(experiment.state.get_value()) experiment.update(10.0) pp(experiment.state.get_value()) print('complete!')
if __name__ == '__main__': test_transcription()