Tree

In [1]:

Copied!





import sys
sys.path.insert(0, '../../')
import picea
from picea import Tree, treeplot
from matplotlib import pyplot as plt
picea.__version__
import sys
sys.path.insert(0, '../../')
import picea
from picea import Tree, treeplot
from matplotlib import pyplot as plt
picea.__version__

Out[1]:

'0.0.31'

In [2]:

Copied!

!pwd
!pwd

/home/runner/work/picea/picea/docs/examples

In [3]:

Copied!





tree = Tree.from_newick(filename='./data/tree.newick')

fig, [[ax1,ax2],[ax3,ax4]] = plt.subplots(ncols=2, nrows=2, figsize=(20,20))

treeplot(tree, style='square', ax=ax1)
treeplot(tree, style='triangular', ltr=False, ax=ax2)
treeplot(tree, style='square', branchlengths=False, ax=ax3)
treeplot(tree, style='radial', ax=ax4)
tree = Tree.from_newick(filename='./data/tree.newick')

fig, [[ax1,ax2],[ax3,ax4]] = plt.subplots(ncols=2, nrows=2, figsize=(20,20))

treeplot(tree, style='square', ax=ax1)
treeplot(tree, style='triangular', ltr=False, ax=ax2)
treeplot(tree, style='square', branchlengths=False, ax=ax3)
treeplot(tree, style='radial', ax=ax4)

/home/runner/work/picea/picea/docs/examples/../../picea/tree.py:177: UserWarning: Found branchlengths on some parts of the tree, but node 0 has no branchlength specified, setting to branchlength 0.0
  warn(

Out[3]:

<Axes: >

No description has been provided for this image

In [4]:

Copied!

0.4 / 25
0.4 / 25

Out[4]:

0.016

In [5]:

Copied!

x_min,x_max = ax3.get_xlim()
x_max - x_min, .1 * (x_max - x_min), (x_min,x_max)
x_min,x_max = ax3.get_xlim()
x_max - x_min, .1 * (x_max - x_min), (x_min,x_max)

Out[5]:

(25.09, 2.5090000000000003, (-0.52, 24.57))

In [6]:

Copied!





from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering().fit(X)
clustering.labels_
from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering().fit(X)
clustering.labels_

Out[6]:

array([1, 1, 1, 0, 0, 0])

In [7]:

Copied!

tree = Tree(children=[Tree(),Tree()])
for t in tree.depth_first():
    print(t.ID,t.name)
tree = Tree(children=[Tree(),Tree()])
for t in tree.depth_first():
    print(t.ID,t.name)

None None
None None
None None

In [8]:

Copied!

t.iloc[None]
t.iloc[None]

Out[8]:

Tree(name=None, length=None, children=[])

In [9]:

Copied!

tree = Tree.from_sklearn(clustering)
tree.to_newick(branch_lengths=False)
tree = Tree.from_sklearn(clustering)
tree.to_newick(branch_lengths=False)

Out[9]:

'((2,(0,1)),(4,(3,5)));'

In [10]:

Copied!

Tree(**tree.to_dict())
Tree(**tree.to_dict())

Out[10]:

Tree(name=None, length=None, children=[{'name': None, 'length': None, 'children': [{'name': '2', 'length': None, 'children': []}, {'name': None, 'length': None, 'children': [{'name': '0', 'length': None, 'children': []}, {'name': '1', 'length': None, 'children': []}]}]}, {'name': None, 'length': None, 'children': [{'name': '4', 'length': None, 'children': []}, {'name': None, 'length': None, 'children': [{'name': '3', 'length': None, 'children': []}, {'name': '5', 'length': None, 'children': []}]}]}])

In [11]:

Copied!

tree.iloc[1].name = 'long name'
tree.iloc[1].name = 'long name'

In [12]:

Copied!

print(tree.to_json(indent=2))
print(tree.to_json(indent=2))

{
  "name": null,
  "length": null,
  "children": [
    {
      "name": null,
      "length": null,
      "children": [
        {
          "name": "2",
          "length": null,
          "children": []
        },
        {
          "name": null,
          "length": null,
          "children": [
            {
              "name": "0",
              "length": null,
              "children": []
            },
            {
              "name": "long name",
              "length": null,
              "children": []
            }
          ]
        }
      ]
    },
    {
      "name": null,
      "length": null,
      "children": [
        {
          "name": "4",
          "length": null,
          "children": []
        },
        {
          "name": null,
          "length": null,
          "children": [
            {
              "name": "3",
              "length": null,
              "children": []
            },
            {
              "name": "5",
              "length": null,
              "children": []
            }
          ]
        }
      ]
    }
  ]
}

In [13]:

Copied!





from matplotlib import pyplot as plt
fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5))

picea.treeplot(tree, style='radial', ltr=False, ax=ax1)
picea.treeplot(tree, style='square', ltr=True, ax=ax2)
picea.treeplot(tree, style='triangular', ltr=False, ax=ax3)

for ax in (ax1,ax2,ax3):
    ax.scatter((0,0),(0,0),c='red')
from matplotlib import pyplot as plt
fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5))

picea.treeplot(tree, style='radial', ltr=False, ax=ax1)
picea.treeplot(tree, style='square', ltr=True, ax=ax2)
picea.treeplot(tree, style='triangular', ltr=False, ax=ax3)

for ax in (ax1,ax2,ax3):
    ax.scatter((0,0),(0,0),c='red')

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[13], line 4
      1 from matplotlib import pyplot as plt
      2 fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5))
----> 4 picea.treeplot(tree, style='radial', ltr=False, ax=ax1)
      5 picea.treeplot(tree, style='square', ltr=True, ax=ax2)
      6 picea.treeplot(tree, style='triangular', ltr=False, ax=ax3)

File ~/work/picea/picea/docs/examples/../../picea/tree.py:475, in treeplot(tree, style, branchlengths, ltr, node_labels, leaf_labels, leaf_marker, leaf_marker_fill, leaf_marker_edge, branch_linestyle, ax, return_layout)
    437 def treeplot(
    438     tree: Tree,
    439     style: TreeStyle = TreeStyle.square,
   (...)
    449     return_layout: bool = False,
    450 ) -> Union[Ax, Tuple[Ax, LayoutDict]]:
    451     """[summary]
    452 
    453     Args:
   (...)
    473         Union[Ax, Tuple[Ax, LayoutDict]]: [description]
    474     """
--> 475     layout = calculate_tree_layout(tree=tree, style=style, ltr=ltr, branchlengths=branchlengths)
    477     if not ax:
    478         _, ax = plt.subplots(figsize=(6, 6))

File ~/work/picea/picea/docs/examples/../../picea/tree.py:417, in calculate_tree_layout(tree, style, ltr, branchlengths)
    415         node_coords.x = increment + max(child_x_coords)
    416     else:
--> 417         node_coords.x = min(child_x_coords) - increment
    418 else:
    419     if previous_node:

TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'

In [14]:

Copied!

fig, ax = plt.subplots(figsize=(10, 10))
picea.treeplot(tree, style='radial', ax=ax)

ax.scatter((0,0),(0,0),c='red')
fig, ax = plt.subplots(figsize=(10, 10))
picea.treeplot(tree, style='radial', ax=ax)

ax.scatter((0,0),(0,0),c='red')

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[14], line 2
      1 fig, ax = plt.subplots(figsize=(10, 10))
----> 2 picea.treeplot(tree, style='radial', ax=ax)
      4 ax.scatter((0,0),(0,0),c='red')

File ~/work/picea/picea/docs/examples/../../picea/tree.py:475, in treeplot(tree, style, branchlengths, ltr, node_labels, leaf_labels, leaf_marker, leaf_marker_fill, leaf_marker_edge, branch_linestyle, ax, return_layout)
    437 def treeplot(
    438     tree: Tree,
    439     style: TreeStyle = TreeStyle.square,
   (...)
    449     return_layout: bool = False,
    450 ) -> Union[Ax, Tuple[Ax, LayoutDict]]:
    451     """[summary]
    452 
    453     Args:
   (...)
    473         Union[Ax, Tuple[Ax, LayoutDict]]: [description]
    474     """
--> 475     layout = calculate_tree_layout(tree=tree, style=style, ltr=ltr, branchlengths=branchlengths)
    477     if not ax:
    478         _, ax = plt.subplots(figsize=(6, 6))

File ~/work/picea/picea/docs/examples/../../picea/tree.py:415, in calculate_tree_layout(tree, style, ltr, branchlengths)
    413 increment = node.length if branchlengths else 1.0
    414 if ltr:
--> 415     node_coords.x = increment + max(child_x_coords)
    416 else:
    417     node_coords.x = min(child_x_coords) - increment

TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'

In [15]:

Copied!





import numpy as np
from dataclasses import dataclass

@dataclass
class TwoDCoordinate():
    x: float = 0.0
    y: float = 0.0
    
    def __iter__(self):
        yield from (self.x, self.y)
    
    def to_polar(self):
        return TwoDCoordinate(
            x = self.x * np.cos(self.y), 
            y = self.x * np.sin(self.y)
        )
    
    def to_cartesian(self):
        return TwoDCoordinate(
            x = np.sqrt(self.x ** 2 + self.y ** 2),
            y = np.arctan2(self.y, self.x)
        )

c = TwoDCoordinate(x=1, y=1)

c
import numpy as np
from dataclasses import dataclass

@dataclass
class TwoDCoordinate():
    x: float = 0.0
    y: float = 0.0
    
    def __iter__(self):
        yield from (self.x, self.y)
    
    def to_polar(self):
        return TwoDCoordinate(
            x = self.x * np.cos(self.y), 
            y = self.x * np.sin(self.y)
        )
    
    def to_cartesian(self):
        return TwoDCoordinate(
            x = np.sqrt(self.x ** 2 + self.y ** 2),
            y = np.arctan2(self.y, self.x)
        )

c = TwoDCoordinate(x=1, y=1)

c

Out[15]:

TwoDCoordinate(x=1, y=1)

In [16]:

Copied!

c.to_cartesian().to_polar()
c.to_cartesian().to_polar()

Out[16]:

TwoDCoordinate(x=1.0000000000000002, y=1.0)

In [17]:

Copied!

c.to_polar().to_cartesian()
c.to_polar().to_cartesian()

Out[17]:

TwoDCoordinate(x=1.0, y=1.0)

In [18]:

Copied!

grid = np.array([
    [TwoDCoordinate(x,y) for x in np.arange(0, 1.2, .2)] 
    for y in np.arange(0, np.pi, .1)
]).flatten()

fig,[ax1,ax2] = plt.subplots(ncols=2, figsize=(20,5))

ax1.scatter(*zip(*[[*p] for p in grid]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in grid]))

points = np.array([TwoDCoordinate(x, x*2) for x in np.arange(0., 1.05, .05)])

ax1.scatter(*zip(*[[*p] for p in points]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in points]))
grid = np.array([
    [TwoDCoordinate(x,y) for x in np.arange(0, 1.2, .2)] 
    for y in np.arange(0, np.pi, .1)
]).flatten()

fig,[ax1,ax2] = plt.subplots(ncols=2, figsize=(20,5))

ax1.scatter(*zip(*[[*p] for p in grid]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in grid]))

points = np.array([TwoDCoordinate(x, x*2) for x in np.arange(0., 1.05, .05)])

ax1.scatter(*zip(*[[*p] for p in points]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in points]))

Out[18]:

<matplotlib.collections.PathCollection at 0x7fc0a3f75c90>

In [19]:

Copied!





seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta')
_msa = seq.align()
msa = _msa._collection
msa.shape
seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta')
_msa = seq.align()
msa = _msa._collection
msa.shape

---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
Cell In[19], line 2
      1 seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta')
----> 2 _msa = seq.align()
      3 msa = _msa._collection
      4 msa.shape

File ~/work/picea/picea/docs/examples/../../picea/sequence.py:1576, in SequenceCollection.align(self, method, method_kwargs)
   1574 fasta = self.to_fasta()
   1575 command = [method, *chain(*method_kwargs.items()), "-"]
-> 1576 process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE)
   1577 stdout, _ = process.communicate(input=fasta.encode())
   1578 aligned_fasta = stdout.decode().strip()

File /usr/lib/python3.10/subprocess.py:971, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize)
    967         if self.text_mode:
    968             self.stderr = io.TextIOWrapper(self.stderr,
    969                     encoding=encoding, errors=errors)
--> 971     self._execute_child(args, executable, preexec_fn, close_fds,
    972                         pass_fds, cwd, env,
    973                         startupinfo, creationflags, shell,
    974                         p2cread, p2cwrite,
    975                         c2pread, c2pwrite,
    976                         errread, errwrite,
    977                         restore_signals,
    978                         gid, gids, uid, umask,
    979                         start_new_session)
    980 except:
    981     # Cleanup if the child failed starting.
    982     for f in filter(None, (self.stdin, self.stdout, self.stderr)):

File /usr/lib/python3.10/subprocess.py:1863, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session)
   1861     if errno_num != 0:
   1862         err_msg = os.strerror(errno_num)
-> 1863     raise child_exception_type(errno_num, err_msg, err_filename)
   1864 raise child_exception_type(err_msg)

FileNotFoundError: [Errno 2] No such file or directory: 'mafft'

In [20]:

Copied!

import numpy as np
np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1)
import numpy as np
np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[20], line 2
      1 import numpy as np
----> 2 np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1)

NameError: name 'v_equals' is not defined

In [21]:

Copied!

np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1)
np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1)

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[21], line 1
----> 1 np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1)

NameError: name 'msa' is not defined

In [22]:

Copied!

np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape
np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[22], line 1
----> 1 np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape

NameError: name 'msa' is not defined

In [23]:

Copied!

msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape
msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[23], line 1
----> 1 msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape

NameError: name 'msa' is not defined

In [24]:

Copied!





#a = np.random.rand(3,4)
#b = a.T

a = msa
b = msa.T

def equals(x,y):
    return x == y

def lt(x,y):
    return x<y

def subst(x,y):
    #print(x,y)
    return substitution_scores[x][y]

v_equals = np.vectorize(equals)
v_lt = np.vectorize(lt)
v_subst = np.vectorize(subst)

np.sum(v_subst(a[...,None], b[None,...]), axis=1)
#a = np.random.rand(3,4)
#b = a.T

a = msa
b = msa.T

def equals(x,y):
    return x == y

def lt(x,y):
    return x

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[24], line 4
      1 #a = np.random.rand(3,4)
      2 #b = a.T
----> 4 a = msa
      5 b = msa.T
      7 def equals(x,y):

NameError: name 'msa' is not defined

In [25]:

Copied!





blosum62_str = """
#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4 
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4 
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4 
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4 
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4 
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4 
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4 
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4 
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4 
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4 
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4 
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4 
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4 
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4 
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4 
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4 
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4 
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4 
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4 
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4 
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
"""
blosum62_str = """
#  Matrix made by matblas from blosum62.iij
#  * column uses minimum score
#  BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
#  Blocks Database = /data/blocks_5.0/blocks.dat
#  Cluster Percentage: >= 62
#  Entropy =   0.6979, Expected =  -0.5209
   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  B  Z  X  *
A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0 -2 -1  0 -4 
R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3 -1  0 -1 -4 
N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  3  0 -1 -4 
D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  4  1 -1 -4 
C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 
Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0  3 -1 -4 
E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3 -1 -2 -1 -4 
H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0  0 -1 -4 
I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3 -3 -3 -1 -4 
L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1 -4 -3 -1 -4 
K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0  1 -1 -4 
M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1 -3 -1 -1 -4 
F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1 -3 -3 -1 -4 
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2 -2 -1 -2 -4 
S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0  0  0 -4 
T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0 -1 -1  0 -4 
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3 -4 -3 -2 -4 
Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1 -3 -2 -1 -4 
V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4 -3 -2 -1 -4 
B -2 -1  3  4 -3  0  1 -1  0 -3 -4  0 -3 -3 -2  0 -1 -4 -3 -3  4  1 -1 -4 
Z -1  0  0  1 -3  3  4 -2  0 -3 -3  1 -1 -3 -1  0 -1 -3 -2 -2  1  4 -1 -4 
X  0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2  0  0 -2 -1 -1 -1 -1 -1 -4 
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4  1
"""

In [26]:

Copied!





lines = (line for line in blosum62_str.split('\n') if line)
matrix_lines = (line.strip().split() for line in lines if line[0] != '#')

substitution_scores = defaultdict(lambda: defaultdict(int))
for idx,matrix_line in enumerate(matrix_lines):
    if idx == 0:
        aas = matrix_line
        encoded_aas = np.array([*''.join(aas).encode()], dtype=np.uint8)
        aa_encoding = dict(zip(aas, encoded_aas))
        continue
    [aa,*scores] = matrix_line
    substitution_scores[aa_encoding[aa]].update(dict(zip(encoded_aas, scores)))
substitution_scores.keys()
lines = (line for line in blosum62_str.split('\n') if line)
matrix_lines = (line.strip().split() for line in lines if line[0] != '#')

substitution_scores = defaultdict(lambda: defaultdict(int))
for idx,matrix_line in enumerate(matrix_lines):
    if idx == 0:
        aas = matrix_line
        encoded_aas = np.array([*''.join(aas).encode()], dtype=np.uint8)
        aa_encoding = dict(zip(aas, encoded_aas))
        continue
    [aa,*scores] = matrix_line
    substitution_scores[aa_encoding[aa]].update(dict(zip(encoded_aas, scores)))
substitution_scores.keys()
    

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[26], line 4
      1 lines = (line for line in blosum62_str.split('\n') if line)
      2 matrix_lines = (line.strip().split() for line in lines if line[0] != '#')
----> 4 substitution_scores = defaultdict(lambda: defaultdict(int))
      5 for idx,matrix_line in enumerate(matrix_lines):
      6     if idx == 0:

NameError: name 'defaultdict' is not defined

In [27]:

Copied!

d = defaultdict(lambda: defaultdict(int))
d[0].update(dict(a=1))
d
d = defaultdict(lambda: defaultdict(int))
d[0].update(dict(a=1))
d

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[27], line 1
----> 1 d = defaultdict(lambda: defaultdict(int))
      2 d[0].update(dict(a=1))
      3 d

NameError: name 'defaultdict' is not defined

In [28]:

Copied!

aa_encoding
aa_encoding

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[28], line 1
----> 1 aa_encoding

NameError: name 'aa_encoding' is not defined

In [29]:

Copied!

np.array([45],dtype=np.uint8).view('S1')[0].decode()
np.array([45],dtype=np.uint8).view('S1')[0].decode()

Out[29]:

'-'