Sequence annotation
In [1]:
Copied!
import sys
# Put the directory two levels above the working directory at the front of the
# module search path *before* importing picea. Presumably this makes the
# in-repository checkout win over any installed copy -- TODO confirm against
# the docs directory layout.
sys.path.insert(0, '../../')
import picea
from picea import SequenceAnnotation
# Last expression of the cell: displays the version string of the package
# that was actually imported.
picea.__version__
import sys
# Put the directory two levels above the working directory at the front of the
# module search path *before* importing picea. Presumably this makes the
# in-repository checkout win over any installed copy -- TODO confirm against
# the docs directory layout.
sys.path.insert(0, '../../')
import picea
from picea import SequenceAnnotation
# Last expression of the cell: displays the version string of the package
# that was actually imported.
picea.__version__
Out[1]:
'0.0.31'
In [2]:
Copied!
# Example annotation in GFF3 form: one gene named EDEN on ctg123, a TF binding
# site, three mRNA isoforms, five exons (shared between isoforms through
# comma-separated Parent values) and the CDS segments of four proteins.
# Each list item is one complete tab-delimited record ending in '\n'; the
# items are joined, with no separator, into the single string given to the
# parser.
gff3 = ''.join([
    # Header pragmas, left commented out so they are not part of the text:
    # '##gff-version 3.1.26\n'
    # '##sequence-region ctg123 1 1497228\n'

    # --- gene and its regulatory site ---
    'ctg123\t.\tgene\t1000\t9000\t.\t+\t.\tID=gene00001;Name=EDEN\n',
    'ctg123\t.\tTF_binding_site\t1000\t1012\t.\t+\t.\tID=tfbs00001;Parent=gene00001\n',  # noqa: E501

    # --- mRNA isoforms ---
    'ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00001;Parent=gene00001;Name=EDEN.1\n',  # noqa: E501
    'ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00002;Parent=gene00001;Name=EDEN.2\n',  # noqa: E501
    'ctg123\t.\tmRNA\t1300\t9000\t.\t+\t.\tID=mRNA00003;Parent=gene00001;Name=EDEN.3\n',  # noqa: E501

    # --- exons ---
    'ctg123\t.\texon\t1300\t1500\t.\t+\t.\tID=exon00001;Parent=mRNA00003\n',
    'ctg123\t.\texon\t1050\t1500\t.\t+\t.\tID=exon00002;Parent=mRNA00001,mRNA00002\n',  # noqa: E501
    'ctg123\t.\texon\t3000\t3902\t.\t+\t.\tID=exon00003;Parent=mRNA00001,mRNA00003\n',  # noqa: E501
    'ctg123\t.\texon\t5000\t5500\t.\t+\t.\tID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003\n',  # noqa: E501
    'ctg123\t.\texon\t7000\t9000\t.\t+\t.\tID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003\n',  # noqa: E501

    # --- CDS segments of edenprotein.1 (mRNA00001) ---
    'ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00001.1;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t3000\t3902\t.\t+\t0\tID=cds00001.2;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00001.3;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00001.4;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501

    # --- CDS segments of edenprotein.2 (mRNA00002) ---
    'ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00002.1;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00002.2;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00002.3;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501

    # --- CDS segments of edenprotein.3 (mRNA00003) ---
    'ctg123\t.\tCDS\t3301\t3902\t.\t+\t0\tID=cds00003.1;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00003.2;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00003.3;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501

    # --- CDS segments of edenprotein.4 (mRNA00003) ---
    'ctg123\t.\tCDS\t3391\t3902\t.\t+\t0\tID=cds00004.1;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00004.2;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00004.3;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
])

# Parse the in-memory GFF3 text into an annotation object.
ann = SequenceAnnotation.from_gff(string=gff3)

# Cell output: the parent feature(s) of the third mRNA isoform.
ann['mRNA00003'].parents.elements
# Example annotation in GFF3 form: one gene named EDEN on ctg123, a TF binding
# site, three mRNA isoforms, five exons (shared between isoforms through
# comma-separated Parent values) and the CDS segments of four proteins.
# Each list item is one complete tab-delimited record ending in '\n'; the
# items are joined, with no separator, into the single string given to the
# parser.
gff3 = ''.join([
    # Header pragmas, left commented out so they are not part of the text:
    # '##gff-version 3.1.26\n'
    # '##sequence-region ctg123 1 1497228\n'

    # --- gene and its regulatory site ---
    'ctg123\t.\tgene\t1000\t9000\t.\t+\t.\tID=gene00001;Name=EDEN\n',
    'ctg123\t.\tTF_binding_site\t1000\t1012\t.\t+\t.\tID=tfbs00001;Parent=gene00001\n',  # noqa: E501

    # --- mRNA isoforms ---
    'ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00001;Parent=gene00001;Name=EDEN.1\n',  # noqa: E501
    'ctg123\t.\tmRNA\t1050\t9000\t.\t+\t.\tID=mRNA00002;Parent=gene00001;Name=EDEN.2\n',  # noqa: E501
    'ctg123\t.\tmRNA\t1300\t9000\t.\t+\t.\tID=mRNA00003;Parent=gene00001;Name=EDEN.3\n',  # noqa: E501

    # --- exons ---
    'ctg123\t.\texon\t1300\t1500\t.\t+\t.\tID=exon00001;Parent=mRNA00003\n',
    'ctg123\t.\texon\t1050\t1500\t.\t+\t.\tID=exon00002;Parent=mRNA00001,mRNA00002\n',  # noqa: E501
    'ctg123\t.\texon\t3000\t3902\t.\t+\t.\tID=exon00003;Parent=mRNA00001,mRNA00003\n',  # noqa: E501
    'ctg123\t.\texon\t5000\t5500\t.\t+\t.\tID=exon00004;Parent=mRNA00001,mRNA00002,mRNA00003\n',  # noqa: E501
    'ctg123\t.\texon\t7000\t9000\t.\t+\t.\tID=exon00005;Parent=mRNA00001,mRNA00002,mRNA00003\n',  # noqa: E501

    # --- CDS segments of edenprotein.1 (mRNA00001) ---
    'ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00001.1;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t3000\t3902\t.\t+\t0\tID=cds00001.2;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00001.3;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00001.4;Parent=mRNA00001;Name=edenprotein.1\n',  # noqa: E501

    # --- CDS segments of edenprotein.2 (mRNA00002) ---
    'ctg123\t.\tCDS\t1201\t1500\t.\t+\t0\tID=cds00002.1;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t0\tID=cds00002.2;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t0\tID=cds00002.3;Parent=mRNA00002;Name=edenprotein.2\n',  # noqa: E501

    # --- CDS segments of edenprotein.3 (mRNA00003) ---
    'ctg123\t.\tCDS\t3301\t3902\t.\t+\t0\tID=cds00003.1;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00003.2;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00003.3;Parent=mRNA00003;Name=edenprotein.3\n',  # noqa: E501

    # --- CDS segments of edenprotein.4 (mRNA00003) ---
    'ctg123\t.\tCDS\t3391\t3902\t.\t+\t0\tID=cds00004.1;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
    'ctg123\t.\tCDS\t5000\t5500\t.\t+\t1\tID=cds00004.2;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
    'ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00004.3;Parent=mRNA00003;Name=edenprotein.4\n',  # noqa: E501
])

# Parse the in-memory GFF3 text into an annotation object.
ann = SequenceAnnotation.from_gff(string=gff3)

# Cell output: the parent feature(s) of the third mRNA isoform.
ann['mRNA00003'].parents.elements
Out[2]:
[<SequenceInterval type=gene ID=gene00001 loc=ctg123..1000..9000..+ at 0x7fe1dfe23910>]
In [3]:
Copied!
# Display the features that list mRNA00003 as a Parent; in the recorded output
# these are its exons followed by its CDS segments.
ann['mRNA00003'].children.elements
# Display the features that list mRNA00003 as a Parent; in the recorded output
# these are its exons followed by its CDS segments.
ann['mRNA00003'].children.elements
Out[3]:
[<SequenceInterval type=exon ID=exon00001 loc=ctg123..1300..1500..+ at 0x7fe1dfe23850>, <SequenceInterval type=exon ID=exon00003 loc=ctg123..3000..3902..+ at 0x7fe1dfe22170>, <SequenceInterval type=exon ID=exon00004 loc=ctg123..5000..5500..+ at 0x7fe1dfe23160>, <SequenceInterval type=exon ID=exon00005 loc=ctg123..7000..9000..+ at 0x7fe1dfe23100>, <SequenceInterval type=CDS ID=cds00003.1 loc=ctg123..3301..3902..+ at 0x7fe1dfe22ec0>, <SequenceInterval type=CDS ID=cds00003.2 loc=ctg123..5000..5500..+ at 0x7fe1dfe22860>, <SequenceInterval type=CDS ID=cds00003.3 loc=ctg123..7000..7600..+ at 0x7fe1dfe22890>, <SequenceInterval type=CDS ID=cds00004.1 loc=ctg123..3391..3902..+ at 0x7fe1dfe22b00>, <SequenceInterval type=CDS ID=cds00004.2 loc=ctg123..5000..5500..+ at 0x7fe1dfe238b0>, <SequenceInterval type=CDS ID=cds00004.3 loc=ctg123..7000..7600..+ at 0x7fe1dfe23310>]
In [4]:
Copied!
# Display the parsed attribute column of a single CDS segment; the recorded
# output is a dict whose values are lists of strings.
# NOTE(review): the recorded output shows the GFF `Name` attribute under the
# lowercase key 'name' while ID and Parent keep their case -- confirm this is
# intended.
ann['cds00004.3'].gff_attributes
# Display the parsed attribute column of a single CDS segment; the recorded
# output is a dict whose values are lists of strings.
# NOTE(review): the recorded output shows the GFF `Name` attribute under the
# lowercase key 'name' while ID and Parent keep their case -- confirm this is
# intended.
ann['cds00004.3'].gff_attributes
Out[4]:
{'name': ['edenprotein.4'], 'ID': ['cds00004.3'], 'Parent': ['mRNA00003']}
In [5]:
Copied!
# Serialise the same CDS segment back to text; the recorded output is one
# tab-delimited GFF3 record with no trailing newline.
ann['cds00004.3'].to_gff_line()
# Serialise the same CDS segment back to text; the recorded output is one
# tab-delimited GFF3 record with no trailing newline.
ann['cds00004.3'].to_gff_line()
Out[5]:
'ctg123\t.\tCDS\t7000\t7600\t.\t+\t1\tID=cds00004.3;Parent=mRNA00003;Name=edenprotein.4'