Tree
In [1]:
Copied!
import sys
sys.path.insert(0, '../../')
import picea
from picea import Tree, treeplot
from matplotlib import pyplot as plt
picea.__version__
import sys
sys.path.insert(0, '../../')
import picea
from picea import Tree, treeplot
from matplotlib import pyplot as plt
picea.__version__
Out[1]:
'0.0.31'
In [2]:
Copied!
!pwd
!pwd
/home/runner/work/picea/picea/docs/examples
In [3]:
Copied!
tree = Tree.from_newick(filename='./data/tree.newick')
fig, [[ax1,ax2],[ax3,ax4]] = plt.subplots(ncols=2, nrows=2, figsize=(20,20))
treeplot(tree, style='square', ax=ax1)
treeplot(tree, style='triangular', ltr=False, ax=ax2)
treeplot(tree, style='square', branchlengths=False, ax=ax3)
treeplot(tree, style='radial', ax=ax4)
tree = Tree.from_newick(filename='./data/tree.newick')
fig, [[ax1,ax2],[ax3,ax4]] = plt.subplots(ncols=2, nrows=2, figsize=(20,20))
treeplot(tree, style='square', ax=ax1)
treeplot(tree, style='triangular', ltr=False, ax=ax2)
treeplot(tree, style='square', branchlengths=False, ax=ax3)
treeplot(tree, style='radial', ax=ax4)
/home/runner/work/picea/picea/docs/examples/../../picea/tree.py:177: UserWarning: Found branchlengths on some parts of the tree, but node 0 has no branchlength specified, setting to branchlength 0.0 warn(
Out[3]:
<Axes: >
In [4]:
Copied!
0.4 / 25
0.4 / 25
Out[4]:
0.016
In [5]:
Copied!
x_min,x_max = ax3.get_xlim()
x_max - x_min, .1 * (x_max - x_min), (x_min,x_max)
x_min,x_max = ax3.get_xlim()
x_max - x_min, .1 * (x_max - x_min), (x_min,x_max)
Out[5]:
(25.09, 2.5090000000000003, (-0.52, 24.57))
In [6]:
Copied!
from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],
[4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering().fit(X)
clustering.labels_
from sklearn.cluster import AgglomerativeClustering
import numpy as np
X = np.array([[1, 2], [1, 4], [1, 0],
[4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering().fit(X)
clustering.labels_
Out[6]:
array([1, 1, 1, 0, 0, 0])
In [7]:
Copied!
tree = Tree(children=[Tree(),Tree()])
for t in tree.depth_first():
print(t.ID,t.name)
tree = Tree(children=[Tree(),Tree()])
for t in tree.depth_first():
print(t.ID,t.name)
None None None None None None
In [8]:
Copied!
t.iloc[None]
t.iloc[None]
Out[8]:
Tree(name=None, length=None, children=[])
In [9]:
Copied!
tree = Tree.from_sklearn(clustering)
tree.to_newick(branch_lengths=False)
tree = Tree.from_sklearn(clustering)
tree.to_newick(branch_lengths=False)
Out[9]:
'((2,(0,1)),(4,(3,5)));'
In [10]:
Copied!
Tree(**tree.to_dict())
Tree(**tree.to_dict())
Out[10]:
Tree(name=None, length=None, children=[{'name': None, 'length': None, 'children': [{'name': '2', 'length': None, 'children': []}, {'name': None, 'length': None, 'children': [{'name': '0', 'length': None, 'children': []}, {'name': '1', 'length': None, 'children': []}]}]}, {'name': None, 'length': None, 'children': [{'name': '4', 'length': None, 'children': []}, {'name': None, 'length': None, 'children': [{'name': '3', 'length': None, 'children': []}, {'name': '5', 'length': None, 'children': []}]}]}])
In [11]:
Copied!
tree.iloc[1].name = 'long name'
tree.iloc[1].name = 'long name'
In [12]:
Copied!
print(tree.to_json(indent=2))
print(tree.to_json(indent=2))
{ "name": null, "length": null, "children": [ { "name": null, "length": null, "children": [ { "name": "2", "length": null, "children": [] }, { "name": null, "length": null, "children": [ { "name": "0", "length": null, "children": [] }, { "name": "long name", "length": null, "children": [] } ] } ] }, { "name": null, "length": null, "children": [ { "name": "4", "length": null, "children": [] }, { "name": null, "length": null, "children": [ { "name": "3", "length": null, "children": [] }, { "name": "5", "length": null, "children": [] } ] } ] } ] }
In [13]:
Copied!
from matplotlib import pyplot as plt
fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5))
picea.treeplot(tree, style='radial', ltr=False, ax=ax1)
picea.treeplot(tree, style='square', ltr=True, ax=ax2)
picea.treeplot(tree, style='triangular', ltr=False, ax=ax3)
for ax in (ax1,ax2,ax3):
ax.scatter((0,0),(0,0),c='red')
from matplotlib import pyplot as plt
fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5))
picea.treeplot(tree, style='radial', ltr=False, ax=ax1)
picea.treeplot(tree, style='square', ltr=True, ax=ax2)
picea.treeplot(tree, style='triangular', ltr=False, ax=ax3)
for ax in (ax1,ax2,ax3):
ax.scatter((0,0),(0,0),c='red')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[13], line 4 1 from matplotlib import pyplot as plt 2 fig, [ax1, ax2, ax3] = plt.subplots(ncols=3,figsize=(15, 5)) ----> 4 picea.treeplot(tree, style='radial', ltr=False, ax=ax1) 5 picea.treeplot(tree, style='square', ltr=True, ax=ax2) 6 picea.treeplot(tree, style='triangular', ltr=False, ax=ax3) File ~/work/picea/picea/docs/examples/../../picea/tree.py:475, in treeplot(tree, style, branchlengths, ltr, node_labels, leaf_labels, leaf_marker, leaf_marker_fill, leaf_marker_edge, branch_linestyle, ax, return_layout) 437 def treeplot( 438 tree: Tree, 439 style: TreeStyle = TreeStyle.square, (...) 449 return_layout: bool = False, 450 ) -> Union[Ax, Tuple[Ax, LayoutDict]]: 451 """[summary] 452 453 Args: (...) 473 Union[Ax, Tuple[Ax, LayoutDict]]: [description] 474 """ --> 475 layout = calculate_tree_layout(tree=tree, style=style, ltr=ltr, branchlengths=branchlengths) 477 if not ax: 478 _, ax = plt.subplots(figsize=(6, 6)) File ~/work/picea/picea/docs/examples/../../picea/tree.py:417, in calculate_tree_layout(tree, style, ltr, branchlengths) 415 node_coords.x = increment + max(child_x_coords) 416 else: --> 417 node_coords.x = min(child_x_coords) - increment 418 else: 419 if previous_node: TypeError: '<' not supported between instances of 'NoneType' and 'NoneType'
In [14]:
Copied!
fig, ax = plt.subplots(figsize=(10, 10))
picea.treeplot(tree, style='radial', ax=ax)
ax.scatter((0,0),(0,0),c='red')
fig, ax = plt.subplots(figsize=(10, 10))
picea.treeplot(tree, style='radial', ax=ax)
ax.scatter((0,0),(0,0),c='red')
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[14], line 2 1 fig, ax = plt.subplots(figsize=(10, 10)) ----> 2 picea.treeplot(tree, style='radial', ax=ax) 4 ax.scatter((0,0),(0,0),c='red') File ~/work/picea/picea/docs/examples/../../picea/tree.py:475, in treeplot(tree, style, branchlengths, ltr, node_labels, leaf_labels, leaf_marker, leaf_marker_fill, leaf_marker_edge, branch_linestyle, ax, return_layout) 437 def treeplot( 438 tree: Tree, 439 style: TreeStyle = TreeStyle.square, (...) 449 return_layout: bool = False, 450 ) -> Union[Ax, Tuple[Ax, LayoutDict]]: 451 """[summary] 452 453 Args: (...) 473 Union[Ax, Tuple[Ax, LayoutDict]]: [description] 474 """ --> 475 layout = calculate_tree_layout(tree=tree, style=style, ltr=ltr, branchlengths=branchlengths) 477 if not ax: 478 _, ax = plt.subplots(figsize=(6, 6)) File ~/work/picea/picea/docs/examples/../../picea/tree.py:415, in calculate_tree_layout(tree, style, ltr, branchlengths) 413 increment = node.length if branchlengths else 1.0 414 if ltr: --> 415 node_coords.x = increment + max(child_x_coords) 416 else: 417 node_coords.x = min(child_x_coords) - increment TypeError: '>' not supported between instances of 'NoneType' and 'NoneType'
In [15]:
Copied!
import numpy as np
from dataclasses import dataclass
@dataclass
class TwoDCoordinate():
x: float = 0.0
y: float = 0.0
def __iter__(self):
yield from (self.x, self.y)
def to_polar(self):
return TwoDCoordinate(
x = self.x * np.cos(self.y),
y = self.x * np.sin(self.y)
)
def to_cartesian(self):
return TwoDCoordinate(
x = np.sqrt(self.x ** 2 + self.y ** 2),
y = np.arctan2(self.y, self.x)
)
c = TwoDCoordinate(x=1, y=1)
c
import numpy as np
from dataclasses import dataclass
@dataclass
class TwoDCoordinate():
x: float = 0.0
y: float = 0.0
def __iter__(self):
yield from (self.x, self.y)
def to_polar(self):
return TwoDCoordinate(
x = self.x * np.cos(self.y),
y = self.x * np.sin(self.y)
)
def to_cartesian(self):
return TwoDCoordinate(
x = np.sqrt(self.x ** 2 + self.y ** 2),
y = np.arctan2(self.y, self.x)
)
c = TwoDCoordinate(x=1, y=1)
c
Out[15]:
TwoDCoordinate(x=1, y=1)
In [16]:
Copied!
c.to_cartesian().to_polar()
c.to_cartesian().to_polar()
Out[16]:
TwoDCoordinate(x=1.0000000000000002, y=1.0)
In [17]:
Copied!
c.to_polar().to_cartesian()
c.to_polar().to_cartesian()
Out[17]:
TwoDCoordinate(x=1.0, y=1.0)
In [18]:
Copied!
grid = np.array([
[TwoDCoordinate(x,y) for x in np.arange(0, 1.2, .2)]
for y in np.arange(0, np.pi, .1)
]).flatten()
fig,[ax1,ax2] = plt.subplots(ncols=2, figsize=(20,5))
ax1.scatter(*zip(*[[*p] for p in grid]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in grid]))
points = np.array([TwoDCoordinate(x, x*2) for x in np.arange(0., 1.05, .05)])
ax1.scatter(*zip(*[[*p] for p in points]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in points]))
grid = np.array([
[TwoDCoordinate(x,y) for x in np.arange(0, 1.2, .2)]
for y in np.arange(0, np.pi, .1)
]).flatten()
fig,[ax1,ax2] = plt.subplots(ncols=2, figsize=(20,5))
ax1.scatter(*zip(*[[*p] for p in grid]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in grid]))
points = np.array([TwoDCoordinate(x, x*2) for x in np.arange(0., 1.05, .05)])
ax1.scatter(*zip(*[[*p] for p in points]))
ax2.scatter(*zip(*[[*p.to_polar()] for p in points]))
Out[18]:
<matplotlib.collections.PathCollection at 0x7fc0a3f75c90>
In [19]:
Copied!
seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta')
_msa = seq.align()
msa = _msa._collection
msa.shape
seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta')
_msa = seq.align()
msa = _msa._collection
msa.shape
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[19], line 2 1 seq = picea.SequenceCollection.from_fasta(filename='./data/HCT.fasta') ----> 2 _msa = seq.align() 3 msa = _msa._collection 4 msa.shape File ~/work/picea/picea/docs/examples/../../picea/sequence.py:1576, in SequenceCollection.align(self, method, method_kwargs) 1574 fasta = self.to_fasta() 1575 command = [method, *chain(*method_kwargs.items()), "-"] -> 1576 process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE) 1577 stdout, _ = process.communicate(input=fasta.encode()) 1578 aligned_fasta = stdout.decode().strip() File /usr/lib/python3.10/subprocess.py:971, in Popen.__init__(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize) 967 if self.text_mode: 968 self.stderr = io.TextIOWrapper(self.stderr, 969 encoding=encoding, errors=errors) --> 971 self._execute_child(args, executable, preexec_fn, close_fds, 972 pass_fds, cwd, env, 973 startupinfo, creationflags, shell, 974 p2cread, p2cwrite, 975 c2pread, c2pwrite, 976 errread, errwrite, 977 restore_signals, 978 gid, gids, uid, umask, 979 start_new_session) 980 except: 981 # Cleanup if the child failed starting. 982 for f in filter(None, (self.stdin, self.stdout, self.stderr)): File /usr/lib/python3.10/subprocess.py:1863, in Popen._execute_child(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, restore_signals, gid, gids, uid, umask, start_new_session) 1861 if errno_num != 0: 1862 err_msg = os.strerror(errno_num) -> 1863 raise child_exception_type(errno_num, err_msg, err_filename) 1864 raise child_exception_type(err_msg) FileNotFoundError: [Errno 2] No such file or directory: 'mafft'
In [20]:
Copied!
import numpy as np
np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1)
import numpy as np
np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[20], line 2 1 import numpy as np ----> 2 np.sum(v_equals(msa[...,None], msa.T[None,...]),axis=1) NameError: name 'v_equals' is not defined
In [21]:
Copied!
np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1)
np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[21], line 1 ----> 1 np.sum(np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]),axis=1) NameError: name 'msa' is not defined
In [22]:
Copied!
np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape
np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[22], line 1 ----> 1 np.equal(msa[...,np.newaxis], msa.T[np.newaxis,...]).shape NameError: name 'msa' is not defined
In [23]:
Copied!
msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape
msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[23], line 1 ----> 1 msa[...,np.newaxis].shape, msa.T[np.newaxis,...].shape NameError: name 'msa' is not defined
In [24]:
Copied!
#a = np.random.rand(3,4)
#b = a.T
a = msa
b = msa.T
def equals(x,y):
return x == y
def lt(x,y):
return x<y
def subst(x,y):
#print(x,y)
return substitution_scores[x][y]
v_equals = np.vectorize(equals)
v_lt = np.vectorize(lt)
v_subst = np.vectorize(subst)
np.sum(v_subst(a[...,None], b[None,...]), axis=1)
#a = np.random.rand(3,4)
#b = a.T
a = msa
b = msa.T
def equals(x,y):
return x == y
def lt(x,y):
return x
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[24], line 4 1 #a = np.random.rand(3,4) 2 #b = a.T ----> 4 a = msa 5 b = msa.T 7 def equals(x,y): NameError: name 'msa' is not defined
In [25]:
Copied!
blosum62_str = """
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
"""
blosum62_str = """
# Matrix made by matblas from blosum62.iij
# * column uses minimum score
# BLOSUM Clustered Scoring Matrix in 1/2 Bit Units
# Blocks Database = /data/blocks_5.0/blocks.dat
# Cluster Percentage: >= 62
# Entropy = 0.6979, Expected = -0.5209
A R N D C Q E G H I L K M F P S T W Y V B Z X *
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4
B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4
Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4
X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4
* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1
"""
In [26]:
Copied!
lines = (line for line in blosum62_str.split('\n') if line)
matrix_lines = (line.strip().split() for line in lines if line[0] != '#')
substitution_scores = defaultdict(lambda: defaultdict(int))
for idx,matrix_line in enumerate(matrix_lines):
if idx == 0:
aas = matrix_line
encoded_aas = np.array([*''.join(aas).encode()], dtype=np.uint8)
aa_encoding = dict(zip(aas, encoded_aas))
continue
[aa,*scores] = matrix_line
substitution_scores[aa_encoding[aa]].update(dict(zip(encoded_aas, scores)))
substitution_scores.keys()
lines = (line for line in blosum62_str.split('\n') if line)
matrix_lines = (line.strip().split() for line in lines if line[0] != '#')
substitution_scores = defaultdict(lambda: defaultdict(int))
for idx,matrix_line in enumerate(matrix_lines):
if idx == 0:
aas = matrix_line
encoded_aas = np.array([*''.join(aas).encode()], dtype=np.uint8)
aa_encoding = dict(zip(aas, encoded_aas))
continue
[aa,*scores] = matrix_line
substitution_scores[aa_encoding[aa]].update(dict(zip(encoded_aas, scores)))
substitution_scores.keys()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[26], line 4 1 lines = (line for line in blosum62_str.split('\n') if line) 2 matrix_lines = (line.strip().split() for line in lines if line[0] != '#') ----> 4 substitution_scores = defaultdict(lambda: defaultdict(int)) 5 for idx,matrix_line in enumerate(matrix_lines): 6 if idx == 0: NameError: name 'defaultdict' is not defined
In [27]:
Copied!
d = defaultdict(lambda: defaultdict(int))
d[0].update(dict(a=1))
d
d = defaultdict(lambda: defaultdict(int))
d[0].update(dict(a=1))
d
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[27], line 1 ----> 1 d = defaultdict(lambda: defaultdict(int)) 2 d[0].update(dict(a=1)) 3 d NameError: name 'defaultdict' is not defined
In [28]:
Copied!
aa_encoding
aa_encoding
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[28], line 1 ----> 1 aa_encoding NameError: name 'aa_encoding' is not defined
In [29]:
Copied!
np.array([45],dtype=np.uint8).view('S1')[0].decode()
np.array([45],dtype=np.uint8).view('S1')[0].decode()
Out[29]:
'-'